fix: recognize words that end in half-width numerals like 小1

As reported here: #709 (comment)
birchill · Oct 24, 2023 · 0107845 · 0107845
1 parent cd77a9e
commit 0107845
Show file tree

Hide file tree

Showing 9 changed files with 127 additions and 49 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@ app.
   ([#1314](https://github.com/birchill/10ten-ja-reader/issues/1314)).
 - Make the `x` key close the popup if it is configured to _both_ close it and
   expand it.
+- Fixed recognition of words that end in half-width numerals like 小1.
 - Made the options page show up in a new tab on Edge.
 - Added a workaround for a
   [Firefox bug](https://bugzilla.mozilla.org/show_bug.cgi?id=1860486) that would

diff --git a/src/content/currency.ts b/src/content/currency.ts
@@ -1,7 +1,7 @@
 import {
   getCombinedCharRange,
   getNegatedCharRange,
-  halfWidthNumbers,
+  startsWithNumeral,
 } from '../utils/char-range';
 
 import { parseNumber } from './numbers';
@@ -13,25 +13,33 @@ export type CurrencyMeta = {
 };
 
 export function lookForCurrency({
+  currentText,
   nodeText,
   textDelimiter: originalTextDelimeter,
 }: {
+  currentText: string;
   nodeText: string;
   textDelimiter: RegExp;
 }): {
   textDelimiter: RegExp;
   textEnd: number;
 } | null {
-  // We only need to expand the search range if it starts with a currency
-  // symbol. For the 8千円 case, the regular text lookup will find the necessary
-  // text.
-  if (nodeText.length && nodeText[0] !== '¥' && nodeText[0] !== '￥') {
+  // If the source text might be a currency, expand our text delimiter to allow
+  // extra symbols that would normally be ignored.
+  //
+  // We _could_ just run the `currencyRegex` on the source text but the
+  // following is hopefully a little faster.
+  const sourceText = currentText + nodeText;
+  const mightBeCurrency =
+    sourceText[0] === '¥' ||
+    sourceText[0] === '￥' ||
+    (startsWithNumeral(sourceText) && sourceText.indexOf('円') > 0);
+  if (!mightBeCurrency) {
     return null;
   }
 
   const japaneseOrPrice = getCombinedCharRange([
     getNegatedCharRange(originalTextDelimeter),
-    halfWidthNumbers,
     /[¥￥\s,、.．。]/,
   ]);
   const textDelimiter = getNegatedCharRange(japaneseOrPrice);

diff --git a/src/content/measure.ts b/src/content/measure.ts
@@ -1,7 +1,8 @@
 import {
   getCombinedCharRange,
   getNegatedCharRange,
-  startsWithNumber,
+  startsWithDigit,
+  startsWithNumeral,
 } from '../utils/char-range';
 
 import { parseNumber } from './numbers';
@@ -16,16 +17,14 @@ export function lookForMeasure({
   textDelimiter: RegExp;
   textEnd: number;
 } | null {
-  if (!startsWithNumber(nodeText)) {
+  if (!startsWithNumeral(nodeText)) {
     return null;
   }
 
-  // getTextFromTextNode should already have expanded this range to include
-  // half-width numbers and serparators so we just need to add the units and
-  // space characters.
+  const includeSeparators = startsWithDigit(nodeText);
   const japaneseOrUnit = getCombinedCharRange([
     getNegatedCharRange(originalTextDelimeter),
-    /[\sm2㎡²]/,
+    includeSeparators ? /[\sm2㎡²,、.．]/ : /[\sm2㎡²]/,
   ]);
   const textDelimiter = getNegatedCharRange(japaneseOrUnit);
 

diff --git a/src/content/meta.ts b/src/content/meta.ts
@@ -33,9 +33,9 @@ export function lookForMetadata({
 } {
   return (
     (matchCurrency
-      ? lookForCurrency({ nodeText, textDelimiter })
+      ? lookForCurrency({ currentText, nodeText, textDelimiter })
       : undefined) ||
-    lookForEra({ currentText, nodeText, textEnd }) ||
+    lookForEra({ currentText, nodeText, textEnd, textDelimiter }) ||
     lookForShogi({ nodeText, textDelimiter }) ||
     lookForMeasure({ nodeText, textDelimiter }) || {
       textDelimiter,

diff --git a/src/content/scan-text.ts b/src/content/scan-text.ts
@@ -1,8 +1,4 @@
-import {
-  nonJapaneseChar,
-  nonJapaneseCharOrNumber,
-  startsWithNumber,
-} from '../utils/char-range';
+import { nonJapaneseChar } from '../utils/char-range';
 import { CursorPosition } from './get-cursor-position';
 import { GetTextAtPointResult } from './get-text';
 import { extractGetTextMetadata, lookForMetadata } from './meta';
@@ -92,12 +88,6 @@ export function scanText({
         result.text +
         nodeText.substring(0, textEnd === -1 ? undefined : textEnd);
 
-      // If the source starts with a number, expand our text delimeter to allow
-      // reading the rest of the number since it might be something like 5つ.
-      if (!currentText.length && startsWithNumber(nodeText)) {
-        textDelimiter = nonJapaneseCharOrNumber;
-      }
-
       // Check if we should further expand the set of allowed characters in
       // order to recognize certain types of metadata-type strings (e.g. years
       // or floor space measurements).

diff --git a/src/content/years.ts b/src/content/years.ts
@@ -1,27 +1,39 @@
+import { getCombinedCharRange, getNegatedCharRange } from '../utils/char-range';
 import { parseNumber } from './numbers';
 
-const nonEraCharacter = /[^\s0-9０-９一二三四五六七八九十百元年]/;
-
 export function lookForEra({
   currentText,
   nodeText,
+  textDelimiter: originalTextDelimeter,
   textEnd,
 }: {
   currentText: string;
   nodeText: string;
+  textDelimiter: RegExp;
   textEnd: number;
 }): {
   textDelimiter: RegExp;
   textEnd: number;
 } | null {
+  // We only want to extend the current range so if `textEnd` is already -1
+  // (i.e. end of the text) then we don't need to do anything.
   if (textEnd < 0 || !startsWithEraName(currentText)) {
     return null;
   }
 
-  const endOfEra = nodeText.substring(textEnd).search(nonEraCharacter);
+  // The original text delimiter should include all the characters needed to
+  // match Japanese years except spaces between the era and the year, and
+  // between the year and the final 年 character, if any.
+  const japaneseOrSpace = getCombinedCharRange([
+    getNegatedCharRange(originalTextDelimeter),
+    /[\s]/,
+  ]);
+  const textDelimiter = getNegatedCharRange(japaneseOrSpace);
+
+  const endOfEra = nodeText.substring(textEnd).search(textDelimiter);
 
   return {
-    textDelimiter: nonEraCharacter,
+    textDelimiter,
     textEnd: endOfEra === -1 ? -1 : textEnd + endOfEra,
   };
 }

diff --git a/src/utils/char-range.ts b/src/utils/char-range.ts
@@ -138,6 +138,8 @@ function isCharacterClassRange(re: RegExp): boolean {
 // typically delimit words.
 /** @public */
 export const japaneseChar = getCombinedCharRange([
+  // We include half-width numbers so we can recognize things like 小1
+  halfWidthNumbers,
   fullWidthAlphanumerics,
   zeroWidthNonJoiner,
   whiteCircle,
@@ -174,18 +176,40 @@ export function getNegatedCharRange(range: RegExp): RegExp {
 
 export const nonJapaneseChar = getNegatedCharRange(japaneseChar);
 
-export const nonJapaneseCharOrNumber = getNegatedCharRange(
-  getCombinedCharRange([japaneseChar, halfWidthNumbers, /[,、.．]/])
-);
-
 export function hasKatakana(text: string): boolean {
   return katakana.test(text);
 }
 
-const numberStartRegex = /^[0-9０-９一二三四五六七八九十百]/;
+export function startsWithDigit(input: string): boolean {
+  const c = input.length ? input.charCodeAt(0) : 0;
+  return (c >= 48 && c <= 57) || (c >= 65296 && c <= 65305);
+}
 
-export function startsWithNumber(input: string): boolean {
-  return !!input.length && numberStartRegex.test(input);
+const kanjiNumerals = [
+  '〇',
+  '一',
+  '二',
+  '三',
+  '四',
+  '五',
+  '六',
+  '七',
+  '八',
+  '九',
+  '十',
+  '百',
+  '千',
+  '万',
+  '億',
+  '兆',
+  '京',
+];
+
+export function startsWithNumeral(input: string): boolean {
+  return (
+    startsWithDigit(input) ||
+    (!!input.length && kanjiNumerals.includes(input[0]))
+  );
 }
 
 const onlyDigits = /^[0-9０-９,，、.．]+$/;

diff --git a/tests/get-text.test.ts b/tests/get-text.test.ts
@@ -419,6 +419,21 @@ describe('getTextAtPoint', () => {
     assertTextResultEqual(result, 'あ\u200cい\u200cう\u200c', [textNode, 0, 6]);
   });
 
+  it('should include trailing half-width numerals', () => {
+    testDiv.append('小1。');
+    const textNode = testDiv.firstChild as Text;
+    const bbox = getBboxForOffset(textNode, 0);
+
+    const result = getTextAtPoint({
+      point: {
+        x: bbox.left,
+        y: bbox.top + bbox.height / 2,
+      },
+    });
+
+    assertTextResultEqual(result, '小1', [textNode, 0, 2]);
+  });
+
   it('should include the year when recognizing years', () => {
     testDiv.append('昭和56年に');
     const textNode = testDiv.firstChild as Text;
@@ -431,7 +446,7 @@ describe('getTextAtPoint', () => {
       },
     });
 
-    assertTextResultEqual(result, '昭和56年', [textNode, 0, 5]);
+    assertTextResultEqual(result, '昭和56年に', [textNode, 0, 6]);
     assert.deepEqual(result!.meta, {
       type: 'era',
       era: '昭和',
@@ -473,7 +488,7 @@ describe('getTextAtPoint', () => {
       },
     });
 
-    assertTextResultEqual(result, '昭和５6年', [textNode, 0, 5]);
+    assertTextResultEqual(result, '昭和５6年に', [textNode, 0, 6]);
     assert.deepEqual(result!.meta, {
       type: 'era',
       era: '昭和',
@@ -495,7 +510,7 @@ describe('getTextAtPoint', () => {
       },
     });
 
-    assertTextResultEqual(result, '昭和 56 年', [textNode, 0, 7]);
+    assertTextResultEqual(result, '昭和 56 年に', [textNode, 0, 8]);
     assert.deepEqual(result!.meta, {
       type: 'era',
       era: '昭和',
@@ -517,7 +532,7 @@ describe('getTextAtPoint', () => {
       },
     });
 
-    assertTextResultEqual(result, '昭和56', [textNode, 0, 4]);
+    assertTextResultEqual(result, '昭和56に', [textNode, 0, 5]);
     assert.deepEqual(result!.meta, {
       type: 'era',
       era: '昭和',
@@ -542,10 +557,10 @@ describe('getTextAtPoint', () => {
 
     assertTextResultEqual(
       result,
-      '昭和56年',
+      '昭和56年に',
       [firstTextNode, 0, 2],
       [middleTextNode, 0, 2],
-      [lastTextNode, 0, 1]
+      [lastTextNode, 0, 2]
     );
     assert.deepEqual(result!.meta, {
       type: 'era',
@@ -570,9 +585,9 @@ describe('getTextAtPoint', () => {
 
     assertTextResultEqual(
       result,
-      '昭和  56年',
+      '昭和  56年に',
       [firstTextNode, 0, 3],
-      [middleTextNode, 0, 4]
+      [middleTextNode, 0, 5]
     );
     assert.deepEqual(result!.meta, {
       type: 'era',
@@ -586,6 +601,7 @@ describe('getTextAtPoint', () => {
     testDiv.innerHTML = '昭和<span>56年</span>に';
     const firstTextNode = testDiv.firstChild as Text;
     const middleTextNode = testDiv.childNodes[1].firstChild as Text;
+    const finalTextNode = testDiv.lastChild as Text;
     const bbox = getBboxForOffset(firstTextNode, 0);
 
     const result = getTextAtPoint({
@@ -597,9 +613,10 @@ describe('getTextAtPoint', () => {
 
     assertTextResultEqual(
       result,
-      '昭和56年',
+      '昭和56年に',
       [firstTextNode, 0, 2],
-      [middleTextNode, 0, 3]
+      [middleTextNode, 0, 3],
+      [finalTextNode, 0, 1]
     );
     assert.deepEqual(result!.meta, {
       type: 'era',
@@ -719,7 +736,7 @@ describe('getTextAtPoint', () => {
     });
   });
 
-  it('should recognize Japanese yen values that start with ￥', () => {
+  it('should recognize Japanese yen values that start with ￥ (full-width)', () => {
     testDiv.append('価格￥8万8千です');
     const textNode = testDiv.firstChild as Text;
     const bbox = getBboxForOffset(textNode, 2);
@@ -739,7 +756,7 @@ describe('getTextAtPoint', () => {
     });
   });
 
-  it('should recognize Japanese yen values that start with ¥', () => {
+  it('should recognize Japanese yen values that start with ¥ (half-width)', () => {
     testDiv.append('価格¥ 8万8千です');
     const textNode = testDiv.firstChild as Text;
     const bbox = getBboxForOffset(textNode, 2);
@@ -759,6 +776,32 @@ describe('getTextAtPoint', () => {
     });
   });
 
+  it('should recognize Japanese yen values that start with ¥ in a separate span', () => {
+    testDiv.innerHTML = '<span>¥</span> 88,000です';
+    const firstTextNode = testDiv.childNodes[0].firstChild as Text;
+    const secondTextNode = testDiv.childNodes[1] as Text;
+    const bbox = getBboxForOffset(firstTextNode, 0);
+
+    const result = getTextAtPoint({
+      point: {
+        x: bbox.left + bbox.width / 2,
+        y: bbox.top + bbox.height / 2,
+      },
+    });
+
+    assertTextResultEqual(
+      result,
+      '¥ 88,000です',
+      [firstTextNode, 0, 1],
+      [secondTextNode, 0, 9]
+    );
+    assert.deepEqual(result!.meta, {
+      type: 'currency',
+      value: 88000,
+      matchLen: 8,
+    });
+  });
+
   it('should recognize Japanese yen values that include commas', () => {
     testDiv.append('価格8,800円です');
     const textNode = testDiv.firstChild as Text;

diff --git a/tests/playground.html b/tests/playground.html
@@ -127,10 +127,11 @@ <h3>Measure tests</h3>
     <li>面積：十二平方米</li>
     <li>8万8千平㍍</li>
   </ul>
-  <h3>Starting number tests</h3>
+  <h3>Starting/end number tests</h3>
   <ul>
     <li>1つ</li>
     <li>14日</li>
+    <li>小1</li>
   </ul>
 
   <h3>Currency tests</h3>