Skip to content

Commit

Permalink
fix: reflect source kana usage when displaying matches
Browse files Browse the repository at this point in the history
Fixes #978.

Specifically, when looking up words like バカ売れ and ガタガタ, we would
prioritize the hiragana version, sometimes even hiding the katakana
version altogether.

This is because, when doing a search on the IDB database, we first
normalize the text to hiragana.

Now, the IDB database itself is able to look up kana variants, but I
believe we were doing the hiragana conversion separately so that we
could do deinflection using normalized hiragana.

This patch moves the normalization to the point where we do deinflection
so that we preserve the source kana form and jpdict-idb can use that to
more accurately set the match labels on the different headwords.

As a result, cases like サボった now correctly show サボる as the match
while cases that use katakana in the deinflected part should still work.
  • Loading branch information
birtles committed Aug 10, 2022
1 parent 6ba0597 commit f7d9c4d
Show file tree
Hide file tree
Showing 6 changed files with 89 additions and 60 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ app.

## [Unreleased]

- Prioritize katakana headwords when matching on katakana
([#978](https://github.com/birchill/10ten-ja-reader/issues/978)).
- Fixed display of currency conversion etc. when there are only name results
([#970](https://github.com/birchill/10ten-ja-reader/issues/970)).

Expand Down
20 changes: 20 additions & 0 deletions src/background/deinflect.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,26 @@ describe('deinflect', () => {
});
});

it('deinflects kana variations', () => {
  // Each case is [inflected input, expected plain form, expected reasons,
  // expected candidate type]. The inputs write the inflected ending in
  // hiragana, katakana, and mixtures of the two.
  const cases: Array<[string, string, DeinflectReason[][], number]> = [
    ['走ります', '走る', [[DeinflectReason.Polite]], 2],
    ['走りまス', '走る', [[DeinflectReason.Polite]], 2],
    ['走りマス', '走る', [[DeinflectReason.Polite]], 2],
    ['走リマス', '走る', [[DeinflectReason.Polite]], 2],
    ['走リマす', '走る', [[DeinflectReason.Polite]], 2],
    ['走った', '走る', [[DeinflectReason.Past]], 2],
    ['走っタ', '走る', [[DeinflectReason.Past]], 2],
    ['走ッタ', '走る', [[DeinflectReason.Past]], 2],
    ['走ッた', '走る', [[DeinflectReason.Past]], 2],
  ];

  for (const [inflected, plain, reasons, type] of cases) {
    // The tuple type above makes `inflected` a string, so no assertion is
    // needed here.
    const result = deinflect(inflected);
    // Strict equality, matching the other tests in this file.
    const match = result.find((candidate) => candidate.word === plain);
    expect(match).toMatchObject({ reasons, type, word: plain });
  }
});

it('deinflects -masu stem forms', () => {
const result = deinflect('食べ');
const match = result.find((candidate) => candidate.word === '食べる');
Expand Down
113 changes: 62 additions & 51 deletions src/background/deinflect.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import { kanaToHiragana } from '@birchill/normal-jp';

export const enum DeinflectReason {
PolitePastNegative,
PoliteNegative,
Expand Down Expand Up @@ -718,63 +720,72 @@ export function deinflect(word: string): CandidateWord[] {
const type = thisCandidate.type;

for (const ruleGroup of ruleGroups) {
if (ruleGroup.fromLen <= word.length) {
const ending = word.substr(-ruleGroup.fromLen);
if (ruleGroup.fromLen > word.length) {
continue;
}

const ending = word.slice(-ruleGroup.fromLen);
const hiraganaEnding = kanaToHiragana(ending);

for (const rule of ruleGroup.rules) {
if (type & rule.type && ending === rule.from) {
const newWord =
word.substr(0, word.length - rule.from.length) + rule.to;
if (newWord.length <= 1) {
continue;
}
for (const rule of ruleGroup.rules) {
if (!(type & rule.type)) {
continue;
}

// If we already have a candidate for this word with the same
// to type(s), expand the possible reasons.
//
// If the to type(s) differ, then we'll add a separate candidate
// and just hope that when we go to match against dictionary words
// we'll filter out the mismatching one(s).
if (resultIndex[newWord]) {
const candidate = result[resultIndex[newWord]];
if (candidate.type === rule.type >> 8) {
candidate.reasons.unshift([rule.reason]);
continue;
}
}
resultIndex[newWord] = result.length;
if (ending !== rule.from && hiraganaEnding !== rule.from) {
continue;
}

// Deep clone multidimensional array
const reasons = [];
for (const array of thisCandidate.reasons) {
reasons.push(Array.from(array));
}
if (reasons.length) {
const firstReason = reasons[0];
// This is a bit hacky but the alternative is to add the
// full-form causative passive inflections to the deinflection
// dictionary and then try to merge the results.
if (
rule.reason === DeinflectReason.Causative &&
firstReason.length &&
firstReason[0] === DeinflectReason.PotentialOrPassive
) {
firstReason.splice(0, 1, DeinflectReason.CausativePassive);
} else {
firstReason.unshift(rule.reason);
}
} else {
reasons.push([rule.reason]);
}
const candidate: CandidateWord = {
reasons,
type: rule.type >> 8,
word: newWord,
};
const newWord =
word.substring(0, word.length - rule.from.length) + rule.to;
if (newWord.length <= 1) {
continue;
}

result.push(candidate);
// If we already have a candidate for this word with the same
// to type(s), expand the possible reasons.
//
// If the to type(s) differ, then we'll add a separate candidate
// and just hope that when we go to match against dictionary words
// we'll filter out the mismatching one(s).
if (resultIndex[newWord]) {
const candidate = result[resultIndex[newWord]];
if (candidate.type === rule.type >> 8) {
candidate.reasons.unshift([rule.reason]);
continue;
}
}
resultIndex[newWord] = result.length;

// Deep clone multidimensional array
const reasons = [];
for (const array of thisCandidate.reasons) {
reasons.push(Array.from(array));
}
if (reasons.length) {
const firstReason = reasons[0];
// This is a bit hacky but the alternative is to add the
// full-form causative passive inflections to the deinflection
// dictionary and then try to merge the results.
if (
rule.reason === DeinflectReason.Causative &&
firstReason.length &&
firstReason[0] === DeinflectReason.PotentialOrPassive
) {
firstReason.splice(0, 1, DeinflectReason.CausativePassive);
} else {
firstReason.unshift(rule.reason);
}
} else {
reasons.push([rule.reason]);
}
const candidate: CandidateWord = {
reasons,
type: rule.type >> 8,
word: newWord,
};

result.push(candidate);
}
}
} while (++i < result.length);
Expand Down
4 changes: 1 addition & 3 deletions src/background/jpdict.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import {
UpdateErrorState,
UpdateState,
} from '@birchill/jpdict-idb';
import { kanaToHiragana } from '@birchill/normal-jp';
import { browser } from 'webextension-polyfill-ts';

import { ExtensionStorageError } from '../common/extension-storage-error';
Expand Down Expand Up @@ -329,8 +328,7 @@ export async function searchWords({
dbStatus: 'updating' | 'unavailable' | undefined
]
> {
let [word, inputLengths] = normalizeInput(input);
word = kanaToHiragana(word);
const [word, inputLengths] = normalizeInput(input);

const maxResults =
max > 0 ? Math.min(WORDS_MAX_ENTRIES, max) : WORDS_MAX_ENTRIES;
Expand Down
4 changes: 2 additions & 2 deletions src/background/word-search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ export async function wordSearch({
continue;
}

// Now that we have filtered our set of matches to those we plan to keep
// Now that we have filtered our set of matches to those we plan to keep,
// update our duplicates set.
have = new Set([...have, ...wordResults.map((word) => word.id)]);

Expand Down Expand Up @@ -123,7 +123,7 @@ export async function wordSearch({

// Shorten input, but don't split a ようおん (e.g. きゃ).
const lengthToShorten = endsInYoon(input) ? 2 : 1;
input = input.substr(0, input.length - lengthToShorten);
input = input.substring(0, input.length - lengthToShorten);
}

if (!result.data.length) {
Expand Down
6 changes: 2 additions & 4 deletions src/utils/normalize-input.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,7 @@ export function normalizeInput(input: string): [string, number[]] {
// Convert to full-width, normalize decomposed characters, expand combined
// characters etc.
const fullWidthInput = halfToFullWidthNum(input);
let [normalized, inputLengths] = toNormalized(
halfToFullWidthNum(fullWidthInput)
);
let [normalized, inputLengths] = toNormalized(fullWidthInput);

// Strip out any zero-width non-joiners (as Google Docs sometimes likes to
// stick them between every single character).
Expand All @@ -28,7 +26,7 @@ export function normalizeInput(input: string): [string, number[]] {
while (inputLengths[outputIndex] < i) {
outputIndex++;
}
normalized = normalized.substr(0, outputIndex);
normalized = normalized.substring(0, outputIndex);
inputLengths = inputLengths.slice(0, outputIndex ? outputIndex + 1 : 0);
break;
}
Expand Down

0 comments on commit f7d9c4d

Please sign in to comment.