Skip to content

Commit

Permalink
fix: recognize words that end in half-width numerals like 小1
Browse files Browse the repository at this point in the history
As reported here:
#709 (comment)
  • Loading branch information
birtles committed Oct 24, 2023
1 parent cd77a9e commit 0107845
Show file tree
Hide file tree
Showing 9 changed files with 127 additions and 49 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ app.
([#1314](https://github.com/birchill/10ten-ja-reader/issues/1314)).
- Make the `x` key close the popup if it is configured to _both_ close it and
expand it.
- Fixed recognition of words that end in half-width numerals like 小1.
- Made the options page show up in a new tab on Edge.
- Added a workaround for a
[Firefox bug](https://bugzilla.mozilla.org/show_bug.cgi?id=1860486) that would
Expand Down
20 changes: 14 additions & 6 deletions src/content/currency.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import {
getCombinedCharRange,
getNegatedCharRange,
halfWidthNumbers,
startsWithNumeral,
} from '../utils/char-range';

import { parseNumber } from './numbers';
Expand All @@ -13,25 +13,33 @@ export type CurrencyMeta = {
};

export function lookForCurrency({
currentText,
nodeText,
textDelimiter: originalTextDelimeter,
}: {
currentText: string;
nodeText: string;
textDelimiter: RegExp;
}): {
textDelimiter: RegExp;
textEnd: number;
} | null {
// We only need to expand the search range if it starts with a currency
// symbol. For the 8千円 case, the regular text lookup will find the necessary
// text.
if (nodeText.length && nodeText[0] !== '¥' && nodeText[0] !== '¥') {
// If the source text might be a currency, expand our text delimiter to allow
// extra symbols that would normally be ignored.
//
// We _could_ just run the `currencyRegex` on the source text but the
// following is hopefully a little faster.
const sourceText = currentText + nodeText;
const mightBeCurrency =
sourceText[0] === '¥' ||
sourceText[0] === '¥' ||
(startsWithNumeral(sourceText) && sourceText.indexOf('円') > 0);
if (!mightBeCurrency) {
return null;
}

const japaneseOrPrice = getCombinedCharRange([
getNegatedCharRange(originalTextDelimeter),
halfWidthNumbers,
/[¥\s,.]/,
]);
const textDelimiter = getNegatedCharRange(japaneseOrPrice);
Expand Down
11 changes: 5 additions & 6 deletions src/content/measure.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import {
getCombinedCharRange,
getNegatedCharRange,
startsWithNumber,
startsWithDigit,
startsWithNumeral,
} from '../utils/char-range';

import { parseNumber } from './numbers';
Expand All @@ -16,16 +17,14 @@ export function lookForMeasure({
textDelimiter: RegExp;
textEnd: number;
} | null {
if (!startsWithNumber(nodeText)) {
if (!startsWithNumeral(nodeText)) {
return null;
}

// getTextFromTextNode should already have expanded this range to include
// half-width numbers and separators so we just need to add the units and
// space characters.
const includeSeparators = startsWithDigit(nodeText);
const japaneseOrUnit = getCombinedCharRange([
getNegatedCharRange(originalTextDelimeter),
/[\sm2²]/,
includeSeparators ? /[\sm2²,.]/ : /[\sm2²]/,
]);
const textDelimiter = getNegatedCharRange(japaneseOrUnit);

Expand Down
4 changes: 2 additions & 2 deletions src/content/meta.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ export function lookForMetadata({
} {
return (
(matchCurrency
? lookForCurrency({ nodeText, textDelimiter })
? lookForCurrency({ currentText, nodeText, textDelimiter })
: undefined) ||
lookForEra({ currentText, nodeText, textEnd }) ||
lookForEra({ currentText, nodeText, textEnd, textDelimiter }) ||
lookForShogi({ nodeText, textDelimiter }) ||
lookForMeasure({ nodeText, textDelimiter }) || {
textDelimiter,
Expand Down
12 changes: 1 addition & 11 deletions src/content/scan-text.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
import {
nonJapaneseChar,
nonJapaneseCharOrNumber,
startsWithNumber,
} from '../utils/char-range';
import { nonJapaneseChar } from '../utils/char-range';
import { CursorPosition } from './get-cursor-position';
import { GetTextAtPointResult } from './get-text';
import { extractGetTextMetadata, lookForMetadata } from './meta';
Expand Down Expand Up @@ -92,12 +88,6 @@ export function scanText({
result.text +
nodeText.substring(0, textEnd === -1 ? undefined : textEnd);

// If the source starts with a number, expand our text delimiter to allow
// reading the rest of the number since it might be something like 5つ.
if (!currentText.length && startsWithNumber(nodeText)) {
textDelimiter = nonJapaneseCharOrNumber;
}

// Check if we should further expand the set of allowed characters in
// order to recognize certain types of metadata-type strings (e.g. years
// or floor space measurements).
Expand Down
20 changes: 16 additions & 4 deletions src/content/years.ts
Original file line number Diff line number Diff line change
@@ -1,27 +1,39 @@
import { getCombinedCharRange, getNegatedCharRange } from '../utils/char-range';
import { parseNumber } from './numbers';

const nonEraCharacter = /[^\s0-9-]/;

export function lookForEra({
currentText,
nodeText,
textDelimiter: originalTextDelimeter,
textEnd,
}: {
currentText: string;
nodeText: string;
textDelimiter: RegExp;
textEnd: number;
}): {
textDelimiter: RegExp;
textEnd: number;
} | null {
// We only want to extend the current range so if `textEnd` is already -1
// (i.e. end of the text) then we don't need to do anything.
if (textEnd < 0 || !startsWithEraName(currentText)) {
return null;
}

const endOfEra = nodeText.substring(textEnd).search(nonEraCharacter);
// The original text delimiter should include all the characters needed to
// match Japanese years except spaces between the era and the year, and
// between the year and the final 年 character, if any.
const japaneseOrSpace = getCombinedCharRange([
getNegatedCharRange(originalTextDelimeter),
/[\s]/,
]);
const textDelimiter = getNegatedCharRange(japaneseOrSpace);

const endOfEra = nodeText.substring(textEnd).search(textDelimiter);

return {
textDelimiter: nonEraCharacter,
textDelimiter,
textEnd: endOfEra === -1 ? -1 : textEnd + endOfEra,
};
}
Expand Down
38 changes: 31 additions & 7 deletions src/utils/char-range.ts
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ function isCharacterClassRange(re: RegExp): boolean {
// typically delimit words.
/** @public */
export const japaneseChar = getCombinedCharRange([
// We include half-width numbers so we can recognize things like 小1
halfWidthNumbers,
fullWidthAlphanumerics,
zeroWidthNonJoiner,
whiteCircle,
Expand Down Expand Up @@ -174,18 +176,40 @@ export function getNegatedCharRange(range: RegExp): RegExp {

export const nonJapaneseChar = getNegatedCharRange(japaneseChar);

export const nonJapaneseCharOrNumber = getNegatedCharRange(
getCombinedCharRange([japaneseChar, halfWidthNumbers, /[,.]/])
);

export function hasKatakana(text: string): boolean {
return katakana.test(text);
}

const numberStartRegex = /^[0-9-]/;
/**
 * Reports whether `input` begins with a decimal digit.
 *
 * Both half-width digits (U+0030–U+0039) and full-width digits
 * (U+FF10–U+FF19) are recognized. Only the first UTF-16 code unit is
 * inspected.
 *
 * @param input - The text to inspect.
 * @returns `true` if the first character is a half-width or full-width digit;
 * `false` otherwise, including for the empty string.
 */
export function startsWithDigit(input: string): boolean {
  if (!input.length) {
    return false;
  }

  const first = input.charCodeAt(0);
  const isHalfWidthDigit = first >= 0x30 && first <= 0x39;
  const isFullWidthDigit = first >= 0xff10 && first <= 0xff19;

  return isHalfWidthDigit || isFullWidthDigit;
}

export function startsWithNumber(input: string): boolean {
return !!input.length && numberStartRegex.test(input);
// Kanji that can begin a number written in Japanese numerals. Each entry is a
// single character so it can be compared against the first character of a
// string.
const kanjiNumerals = [
  // The digits zero through nine
  ...'〇一二三四五六七八九',
  // Place-value characters (ten, hundred, thousand, and the larger powers)
  ...'十百千万億兆京',
];

/**
 * Reports whether `input` begins with a numeral of any supported kind.
 *
 * A numeral here is either a digit accepted by `startsWithDigit` or one of the
 * characters listed in `kanjiNumerals`.
 *
 * @param input - The text to inspect.
 * @returns `true` if the first character is a digit or a kanji numeral;
 * `false` otherwise, including for the empty string.
 */
export function startsWithNumeral(input: string): boolean {
  if (startsWithDigit(input)) {
    return true;
  }

  if (!input.length) {
    return false;
  }

  return kanjiNumerals.includes(input[0]);
}

const onlyDigits = /^[0-9-,.]+$/;
Expand Down
67 changes: 55 additions & 12 deletions tests/get-text.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,21 @@ describe('getTextAtPoint', () => {
assertTextResultEqual(result, 'あ\u200cい\u200cう\u200c', [textNode, 0, 6]);
});

it('should include trailing half-width numerals', () => {
testDiv.append('小1。');
const textNode = testDiv.firstChild as Text;
const bbox = getBboxForOffset(textNode, 0);

const result = getTextAtPoint({
point: {
x: bbox.left,
y: bbox.top + bbox.height / 2,
},
});

assertTextResultEqual(result, '小1', [textNode, 0, 2]);
});

it('should include the year when recognizing years', () => {
testDiv.append('昭和56年に');
const textNode = testDiv.firstChild as Text;
Expand All @@ -431,7 +446,7 @@ describe('getTextAtPoint', () => {
},
});

assertTextResultEqual(result, '昭和56年', [textNode, 0, 5]);
assertTextResultEqual(result, '昭和56年に', [textNode, 0, 6]);
assert.deepEqual(result!.meta, {
type: 'era',
era: '昭和',
Expand Down Expand Up @@ -473,7 +488,7 @@ describe('getTextAtPoint', () => {
},
});

assertTextResultEqual(result, '昭和56年', [textNode, 0, 5]);
assertTextResultEqual(result, '昭和56年に', [textNode, 0, 6]);
assert.deepEqual(result!.meta, {
type: 'era',
era: '昭和',
Expand All @@ -495,7 +510,7 @@ describe('getTextAtPoint', () => {
},
});

assertTextResultEqual(result, '昭和 56 ', [textNode, 0, 7]);
assertTextResultEqual(result, '昭和 56 年に', [textNode, 0, 8]);
assert.deepEqual(result!.meta, {
type: 'era',
era: '昭和',
Expand All @@ -517,7 +532,7 @@ describe('getTextAtPoint', () => {
},
});

assertTextResultEqual(result, '昭和56', [textNode, 0, 4]);
assertTextResultEqual(result, '昭和56に', [textNode, 0, 5]);
assert.deepEqual(result!.meta, {
type: 'era',
era: '昭和',
Expand All @@ -542,10 +557,10 @@ describe('getTextAtPoint', () => {

assertTextResultEqual(
result,
'昭和56年',
'昭和56年に',
[firstTextNode, 0, 2],
[middleTextNode, 0, 2],
[lastTextNode, 0, 1]
[lastTextNode, 0, 2]
);
assert.deepEqual(result!.meta, {
type: 'era',
Expand All @@ -570,9 +585,9 @@ describe('getTextAtPoint', () => {

assertTextResultEqual(
result,
'昭和 56年',
'昭和 56年に',
[firstTextNode, 0, 3],
[middleTextNode, 0, 4]
[middleTextNode, 0, 5]
);
assert.deepEqual(result!.meta, {
type: 'era',
Expand All @@ -586,6 +601,7 @@ describe('getTextAtPoint', () => {
testDiv.innerHTML = '昭和<span>56年</span>に';
const firstTextNode = testDiv.firstChild as Text;
const middleTextNode = testDiv.childNodes[1].firstChild as Text;
const finalTextNode = testDiv.lastChild as Text;
const bbox = getBboxForOffset(firstTextNode, 0);

const result = getTextAtPoint({
Expand All @@ -597,9 +613,10 @@ describe('getTextAtPoint', () => {

assertTextResultEqual(
result,
'昭和56年',
'昭和56年に',
[firstTextNode, 0, 2],
[middleTextNode, 0, 3]
[middleTextNode, 0, 3],
[finalTextNode, 0, 1]
);
assert.deepEqual(result!.meta, {
type: 'era',
Expand Down Expand Up @@ -719,7 +736,7 @@ describe('getTextAtPoint', () => {
});
});

it('should recognize Japanese yen values that start with ¥', () => {
it('should recognize Japanese yen values that start with ¥ (full-width)', () => {
testDiv.append('価格¥8万8千です');
const textNode = testDiv.firstChild as Text;
const bbox = getBboxForOffset(textNode, 2);
Expand All @@ -739,7 +756,7 @@ describe('getTextAtPoint', () => {
});
});

it('should recognize Japanese yen values that start with ¥', () => {
it('should recognize Japanese yen values that start with ¥ (half-width)', () => {
testDiv.append('価格¥ 8万8千です');
const textNode = testDiv.firstChild as Text;
const bbox = getBboxForOffset(textNode, 2);
Expand All @@ -759,6 +776,32 @@ describe('getTextAtPoint', () => {
});
});

it('should recognize Japanese yen values that start with ¥ in a separate span', () => {
testDiv.innerHTML = '<span>¥</span> 88,000です';
const firstTextNode = testDiv.childNodes[0].firstChild as Text;
const secondTextNode = testDiv.childNodes[1] as Text;
const bbox = getBboxForOffset(firstTextNode, 0);

const result = getTextAtPoint({
point: {
x: bbox.left + bbox.width / 2,
y: bbox.top + bbox.height / 2,
},
});

assertTextResultEqual(
result,
'¥ 88,000です',
[firstTextNode, 0, 1],
[secondTextNode, 0, 9]
);
assert.deepEqual(result!.meta, {
type: 'currency',
value: 88000,
matchLen: 8,
});
});

it('should recognize Japanese yen values that include commas', () => {
testDiv.append('価格8,800円です');
const textNode = testDiv.firstChild as Text;
Expand Down
3 changes: 2 additions & 1 deletion tests/playground.html
Original file line number Diff line number Diff line change
Expand Up @@ -127,10 +127,11 @@ <h3>Measure tests</h3>
<li>面積:十二平方米</li>
<li>8万8千平㍍</li>
</ul>
<h3>Starting number tests</h3>
<h3>Starting/end number tests</h3>
<ul>
<li>1つ</li>
<li>14日</li>
<li>小1</li>
</ul>

<h3>Currency tests</h3>
Expand Down

0 comments on commit 0107845

Please sign in to comment.