fix: recognize words that end in half-width numerals like 小1

As reported in a comment on birchill/10ten-ja-reader#709.
birchill · Oct 24, 2023 · 4538ebf · 4538ebf
1 parent cd77a9e
commit 4538ebf
Show file tree

Hide file tree

Showing 10 changed files with 138 additions and 63 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@ app.
   ([#1314](https://github.com/birchill/10ten-ja-reader/issues/1314)).
 - Make the `x` key close the popup if it is configured to _both_ close it and
   expand it.
+- Fixed recognition of words that end in half-width numerals like 小1.
 - Made the options page show up in a new tab on Edge.
 - Added a workaround for a
   [Firefox bug](https://bugzilla.mozilla.org/show_bug.cgi?id=1860486) that would

diff --git a/src/content/currency.ts b/src/content/currency.ts
@@ -1,7 +1,7 @@
 import {
   getCombinedCharRange,
   getNegatedCharRange,
-  halfWidthNumbers,
+  startsWithNumeral,
 } from '../utils/char-range';
 
 import { parseNumber } from './numbers';
@@ -13,25 +13,30 @@ export type CurrencyMeta = {
 };
 
 export function lookForCurrency({
+  currentText,
   nodeText,
-  textDelimiter: originalTextDelimeter,
+  textDelimiter: originalTextDelimiter,
 }: {
+  currentText: string;
   nodeText: string;
   textDelimiter: RegExp;
 }): {
   textDelimiter: RegExp;
   textEnd: number;
 } | null {
-  // We only need to expand the search range if it starts with a currency
-  // symbol. For the 8千円 case, the regular text lookup will find the necessary
-  // text.
-  if (nodeText.length && nodeText[0] !== '¥' && nodeText[0] !== '￥') {
+  // If the source text might be a currency, expand our text delimiter to allow
+  // extra symbols that would normally be ignored.
+  const sourceText = currentText + nodeText;
+  const mightBeCurrency =
+    sourceText[0] === '¥' ||
+    sourceText[0] === '￥' ||
+    (startsWithNumeral(sourceText) && sourceText.indexOf('円') > 0);
+  if (!mightBeCurrency) {
     return null;
   }
 
   const japaneseOrPrice = getCombinedCharRange([
-    getNegatedCharRange(originalTextDelimeter),
-    halfWidthNumbers,
+    getNegatedCharRange(originalTextDelimiter),
     /[¥￥\s,、.．。]/,
   ]);
   const textDelimiter = getNegatedCharRange(japaneseOrPrice);

diff --git a/src/content/measure.ts b/src/content/measure.ts
@@ -1,31 +1,30 @@
 import {
   getCombinedCharRange,
   getNegatedCharRange,
-  startsWithNumber,
+  startsWithDigit,
+  startsWithNumeral,
 } from '../utils/char-range';
 
 import { parseNumber } from './numbers';
 
 export function lookForMeasure({
   nodeText,
-  textDelimiter: originalTextDelimeter,
+  textDelimiter: originalTextDelimiter,
 }: {
   nodeText: string;
   textDelimiter: RegExp;
 }): {
   textDelimiter: RegExp;
   textEnd: number;
 } | null {
-  if (!startsWithNumber(nodeText)) {
+  if (!startsWithNumeral(nodeText)) {
     return null;
   }
 
-  // getTextFromTextNode should already have expanded this range to include
-  // half-width numbers and serparators so we just need to add the units and
-  // space characters.
+  const includeSeparators = startsWithDigit(nodeText);
   const japaneseOrUnit = getCombinedCharRange([
-    getNegatedCharRange(originalTextDelimeter),
-    /[\sm2㎡²]/,
+    getNegatedCharRange(originalTextDelimiter),
+    includeSeparators ? /[\sm2㎡²,、.．]/ : /[\sm2㎡²]/,
   ]);
   const textDelimiter = getNegatedCharRange(japaneseOrUnit);
 

diff --git a/src/content/meta.ts b/src/content/meta.ts
@@ -33,9 +33,9 @@ export function lookForMetadata({
 } {
   return (
     (matchCurrency
-      ? lookForCurrency({ nodeText, textDelimiter })
+      ? lookForCurrency({ currentText, nodeText, textDelimiter })
       : undefined) ||
-    lookForEra({ currentText, nodeText, textEnd }) ||
+    lookForEra({ currentText, nodeText, textEnd, textDelimiter }) ||
     lookForShogi({ nodeText, textDelimiter }) ||
     lookForMeasure({ nodeText, textDelimiter }) || {
       textDelimiter,

diff --git a/src/content/scan-text.ts b/src/content/scan-text.ts
@@ -1,8 +1,4 @@
-import {
-  nonJapaneseChar,
-  nonJapaneseCharOrNumber,
-  startsWithNumber,
-} from '../utils/char-range';
+import { nonJapaneseChar } from '../utils/char-range';
 import { CursorPosition } from './get-cursor-position';
 import { GetTextAtPointResult } from './get-text';
 import { extractGetTextMetadata, lookForMetadata } from './meta';
@@ -92,12 +88,6 @@ export function scanText({
         result.text +
         nodeText.substring(0, textEnd === -1 ? undefined : textEnd);
 
-      // If the source starts with a number, expand our text delimeter to allow
-      // reading the rest of the number since it might be something like 5つ.
-      if (!currentText.length && startsWithNumber(nodeText)) {
-        textDelimiter = nonJapaneseCharOrNumber;
-      }
-
       // Check if we should further expand the set of allowed characters in
       // order to recognize certain types of metadata-type strings (e.g. years
       // or floor space measurements).

diff --git a/src/content/shogi.ts b/src/content/shogi.ts
@@ -66,7 +66,7 @@ export type ShogiMeta = {
 
 export function lookForShogi({
   nodeText,
-  textDelimiter: originalTextDelimeter,
+  textDelimiter: originalTextDelimiter,
 }: {
   nodeText: string;
   textDelimiter: RegExp;
@@ -79,28 +79,28 @@ export function lookForShogi({
   }
 
   // If the text starts with one of the shogi side indicators, then we assume
-  // that the text is a shogi move and we can use the shogi delimeter.
+  // that the text is a shogi move and we can use the shogi delimiter.
   if (['▲', '△', '☗', '☖'].includes(nodeText[0])) {
     return {
-      textDelimiter: shogiDelimeter,
-      textEnd: nodeText.search(shogiDelimeter),
+      textDelimiter: shogiDelimiter,
+      textEnd: nodeText.search(shogiDelimiter),
     };
   }
 
   // Otherwise, if it starts with an Arabic number followed by a kanji number
   // OR it starts with one of the characters meaning "same position" then
-  // expand the delimeter range to include all the shogi characters.
+  // expand the delimiter range to include all the shogi characters.
   if (!unprefixedShogiStart.test(nodeText)) {
     return null;
   }
 
-  const expandedDelimeter = getCombinedCharRange([
-    getNegatedCharRange(originalTextDelimeter),
+  const expandedDelimiter = getCombinedCharRange([
+    getNegatedCharRange(originalTextDelimiter),
     /[↑]/,
     // All the other characters such as 𠔼丶フゝ・○ etc. should already be
     // covered by `japaneseChar` so we don't need to add them here.
   ]);
-  const textDelimiter = getNegatedCharRange(expandedDelimeter);
+  const textDelimiter = getNegatedCharRange(expandedDelimiter);
 
   return {
     textDelimiter,
@@ -109,7 +109,7 @@ export function lookForShogi({
 }
 
 // This needs to be kept in sync with the regexes below.
-const shogiDelimeter =
+const shogiDelimiter =
   /[^▲△☗☖1-9１-９一二三四五六七八九同仝－𠔼ド歩兵丶フゝ・香禾キ↑桂土銀ヨ角ク飛ヒ乙金人と成ナ馬マウ龍竜立リ玉王○打引寄上行入右左直行入不生]/u;
 const unprefixedShogiStart = /^[1-9１-９][一二三四五六七八九]|[同仝－𠔼ド]/u;
 

diff --git a/src/content/years.ts b/src/content/years.ts
@@ -1,27 +1,39 @@
+import { getCombinedCharRange, getNegatedCharRange } from '../utils/char-range';
 import { parseNumber } from './numbers';
 
-const nonEraCharacter = /[^\s0-9０-９一二三四五六七八九十百元年]/;
-
 export function lookForEra({
   currentText,
   nodeText,
+  textDelimiter: originalTextDelimiter,
   textEnd,
 }: {
   currentText: string;
   nodeText: string;
+  textDelimiter: RegExp;
   textEnd: number;
 }): {
   textDelimiter: RegExp;
   textEnd: number;
 } | null {
+  // We only want to _extend_ the current range so if `textEnd` is already -1
+  // (i.e. end of the text) then we don't need to do anything.
   if (textEnd < 0 || !startsWithEraName(currentText)) {
     return null;
   }
 
-  const endOfEra = nodeText.substring(textEnd).search(nonEraCharacter);
+  // The original text delimiter should include all the characters needed to
+  // match Japanese years except spaces between the era and the year, and
+  // spaces between the year and the final 年 character, if any.
+  const japaneseOrSpace = getCombinedCharRange([
+    getNegatedCharRange(originalTextDelimiter),
+    /[\s]/,
+  ]);
+  const textDelimiter = getNegatedCharRange(japaneseOrSpace);
+
+  const endOfEra = nodeText.substring(textEnd).search(textDelimiter);
 
   return {
-    textDelimiter: nonEraCharacter,
+    textDelimiter,
     textEnd: endOfEra === -1 ? -1 : textEnd + endOfEra,
   };
 }

diff --git a/src/utils/char-range.ts b/src/utils/char-range.ts
@@ -138,6 +138,8 @@ function isCharacterClassRange(re: RegExp): boolean {
 // typically delimit words.
 /** @public */
 export const japaneseChar = getCombinedCharRange([
+  // We include half-width numbers so we can recognize things like 小1
+  halfWidthNumbers,
   fullWidthAlphanumerics,
   zeroWidthNonJoiner,
   whiteCircle,
@@ -174,18 +176,40 @@ export function getNegatedCharRange(range: RegExp): RegExp {
 
 export const nonJapaneseChar = getNegatedCharRange(japaneseChar);
 
-export const nonJapaneseCharOrNumber = getNegatedCharRange(
-  getCombinedCharRange([japaneseChar, halfWidthNumbers, /[,、.．]/])
-);
-
 export function hasKatakana(text: string): boolean {
   return katakana.test(text);
 }
 
-const numberStartRegex = /^[0-9０-９一二三四五六七八九十百]/;
+export function startsWithDigit(input: string): boolean {
+  const c = input.length ? input.charCodeAt(0) : 0;
+  return (c >= 48 && c <= 57) || (c >= 65296 && c <= 65305);
+}
 
-export function startsWithNumber(input: string): boolean {
-  return !!input.length && numberStartRegex.test(input);
+const kanjiNumerals = [
+  '〇',
+  '一',
+  '二',
+  '三',
+  '四',
+  '五',
+  '六',
+  '七',
+  '八',
+  '九',
+  '十',
+  '百',
+  '千',
+  '万',
+  '億',
+  '兆',
+  '京',
+];
+
+export function startsWithNumeral(input: string): boolean {
+  return (
+    startsWithDigit(input) ||
+    (!!input.length && kanjiNumerals.includes(input[0]))
+  );
 }
 
 const onlyDigits = /^[0-9０-９,，、.．]+$/;