From f4b0e701b23da978e6a6c199ad0add096b23fb6f Mon Sep 17 00:00:00 2001 From: aitelint <53436544+aitelint@users.noreply.github.com> Date: Wed, 13 Jul 2022 12:58:23 +0300 Subject: [PATCH] [EN DateTime V2] Added support for cases like "April ninth through 15th" (#2905) (#2994) * Added support for cases like "April ninth through 15th" (#2905) * Modified fix to use regexes instead of OrdinalExtractor according to review * Removed DateContext modifications * Corrected bug in Hindi Ordinal extraction Co-authored-by: aitelint --- .../English/DateTimeDefinitions.cs | 12 +- .../Hindi/NumbersDefinitions.cs | 2 +- .../Constants.cs | 5 + .../Extractors/BaseDatePeriodExtractor.cs | 4 +- .../Parsers/BaseDatePeriodParser.cs | 44 ++- Patterns/English/English-DateTime.yaml | 26 +- Patterns/Hindi/Hindi-Numbers.yaml | 2 +- Specs/DateTime/English/DateTimeModel.json | 310 ++++++++++++++++++ .../OrdinalModelSuppressExtendedTypes.json | 14 +- 9 files changed, 387 insertions(+), 32 deletions(-) diff --git a/.NET/Microsoft.Recognizers.Definitions.Common/English/DateTimeDefinitions.cs b/.NET/Microsoft.Recognizers.Definitions.Common/English/DateTimeDefinitions.cs index 41419e3646..657e0f7f25 100644 --- a/.NET/Microsoft.Recognizers.Definitions.Common/English/DateTimeDefinitions.cs +++ b/.NET/Microsoft.Recognizers.Definitions.Common/English/DateTimeDefinitions.cs @@ -46,6 +46,10 @@ public static class DateTimeDefinitions public const string WrittenElevenToNineteenRegex = @"(?:eleven|twelve|(?:thir|four|fif|six|seven|eigh|nine)teen)"; public const string WrittenTensRegex = @"(?:ten|twenty|thirty|fou?rty|fifty|sixty|seventy|eighty|ninety)"; public static readonly string WrittenNumRegex = $@"(?:{WrittenOneToNineRegex}|{WrittenElevenToNineteenRegex}|{WrittenTensRegex}(\s+{WrittenOneToNineRegex})?)"; + public const string WrittenOneToNineOrdinalRegex = @"(?:first|second|third|fourth|fifth|sixth|seventh|eighth|nine?th)"; + public const string WrittenTensOrdinalRegex = @"(?:tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|thirtieth|fortieth|fiftieth|sixtieth|seventieth|eightieth|ninetieth)"; + public static readonly string WrittenOrdinalRegex = $@"(?:{WrittenOneToNineOrdinalRegex}|{WrittenTensOrdinalRegex}|{WrittenTensRegex}\s+{WrittenOneToNineOrdinalRegex})"; + public static readonly string WrittenOrdinalDayRegex = $@"\b(the\s+)?(?(?{WrittenOneToNineOrdinalRegex}|(?:tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|thirtieth)|(?:ten|twenty)\s+{WrittenOneToNineOrdinalRegex}|thirty\s+first))\b"; public static readonly string WrittenCenturyFullYearRegex = $@"(?:(one|two)\s+thousand((\s+and)?\s+{WrittenOneToNineRegex}\s+hundred)?)"; public const string WrittenCenturyOrdinalYearRegex = @"(?:twenty(\s+(one|two))?|ten|eleven|twelve|thirteen|fifteen|eighteen|(?:four|six|seven|nine)(teen)?|one|two|three|five|eight)"; public static readonly string CenturyRegex = $@"\b(?{WrittenCenturyFullYearRegex}|{WrittenCenturyOrdinalYearRegex}(\s+hundred)?)\b"; @@ -78,10 +82,10 @@ public static class DateTimeDefinitions public const string ToTokenRegex = @"\b(to)$"; public const string FromRegex = @"\b(from(\s+the)?)$"; public const string BetweenTokenRegex = @"\b(between(\s+the)?)$"; - public static readonly string SimpleCasesRegex = $@"\b({RangePrefixRegex}\s+)?({DayRegex})\s*{TillRegex}\s*({DayRegex}\s+{MonthSuffixRegex}|{MonthSuffixRegex}\s+{DayRegex})((\s+|\s*,\s*){YearRegex})?\b"; - public static readonly string MonthFrontSimpleCasesRegex = $@"\b({RangePrefixRegex}\s+)?{MonthSuffixRegex}\s+((from)\s+)?({DayRegex})\s*{TillRegex}\s*({DayRegex})((\s+|\s*,\s*){YearRegex})?\b"; - public static readonly string MonthFrontBetweenRegex = $@"\b{MonthSuffixRegex}\s+(between\s+)({DayRegex})\s*{RangeConnectorRegex}\s*({DayRegex})((\s+|\s*,\s*){YearRegex})?\b"; - public static readonly string BetweenRegex = $@"\b(between\s+)({DayRegex})\s*{RangeConnectorRegex}\s*({DayRegex})\s+{MonthSuffixRegex}((\s+|\s*,\s*){YearRegex})?\b"; + public static readonly string SimpleCasesRegex = $@"\b({RangePrefixRegex}\s+)?({DayRegex}|{WrittenOrdinalDayRegex})\s*{TillRegex}\s*(({DayRegex}|{WrittenOrdinalDayRegex})\s+{MonthSuffixRegex}|{MonthSuffixRegex}\s+({DayRegex}|{WrittenOrdinalDayRegex}))((\s+|\s*,\s*){YearRegex})?\b"; + public static readonly string MonthFrontSimpleCasesRegex = $@"\b({RangePrefixRegex}\s+)?{MonthSuffixRegex}\s+((from)\s+)?({DayRegex}|{WrittenOrdinalDayRegex})\s*{TillRegex}\s*({DayRegex}|{WrittenOrdinalDayRegex})((\s+|\s*,\s*){YearRegex})?\b"; + public static readonly string MonthFrontBetweenRegex = $@"\b{MonthSuffixRegex}\s+(between\s+)({DayRegex}|{WrittenOrdinalDayRegex})\s*{RangeConnectorRegex}\s*({DayRegex}|{WrittenOrdinalDayRegex})((\s+|\s*,\s*){YearRegex})?\b"; + public static readonly string BetweenRegex = $@"\b(between\s+)({DayRegex}|{WrittenOrdinalDayRegex})\s*{RangeConnectorRegex}\s*({DayRegex}|{WrittenOrdinalDayRegex})\s+{MonthSuffixRegex}((\s+|\s*,\s*){YearRegex})?\b"; public static readonly string MonthWithYear = $@"\b((({WrittenMonthRegex}[\.]?|((the\s+)?(?first|1st|second|2nd|third|3rd|fourth|4th|fifth|5th|sixth|6th|seventh|7th|eighth|8th|ninth|9th|tenth|10th|eleventh|11th|twelfth|12th|last)\s+month(?=\s+(of|in))))((\s*)[/\\\-\.,]?(\s+(of|in))?(\s*)({YearRegex}|(?following|next|last|this)\s+year)|\s+(of|in)\s+{TwoDigitYearRegex}))|(({YearRegex}|(?following|next|last|this)\s+year)(\s*),?(\s*){WrittenMonthRegex}))\b"; public const string SpecialYearPrefixes = @"(calendar|(?fiscal|school))"; public static readonly string OneWordPeriodRegex = $@"\b((((the\s+)?month of\s+)?({StrictRelativeRegex}\s+)?{MonthRegex})|(month|year) to date|(?((un)?till?|to)\s+date)|({RelativeRegex}\s+)?(my\s+)?((?working\s+week|workweek)|week(end)?|month|fortnight|(({SpecialYearPrefixes}\s+)?year))(?!((\s+of)?\s+\d+(?!({BaseDateTime.BaseAmDescRegex}|{BaseDateTime.BasePmDescRegex}))|\s+to\s+date))(\s+{AfterNextSuffixRegex})?)\b"; diff --git a/.NET/Microsoft.Recognizers.Definitions.Common/Hindi/NumbersDefinitions.cs b/.NET/Microsoft.Recognizers.Definitions.Common/Hindi/NumbersDefinitions.cs index dfbbe8cd35..511dcb6ead 100644 --- a/.NET/Microsoft.Recognizers.Definitions.Common/Hindi/NumbersDefinitions.cs +++ b/.NET/Microsoft.Recognizers.Definitions.Common/Hindi/NumbersDefinitions.cs @@ -70,7 +70,7 @@ public static class NumbersDefinitions public const string DecimalUnitsRegex = @"(?:डेढ़|डेढ़|डेढ|ढाई|सवा|सावा)"; public static readonly string DecimalUnitsWithRoundNumberRegex = $@"({DecimalUnitsRegex}\s+({{AllNumericalIntRegex}}\s+)?{RoundNumberIntegerRegex}|{DecimalUnitsRegex})"; public const string RoundNumberOrdinalRegex = @"(?:(सौ|हजार|हज़ार|लाख|करोड़|अरब|खरब)(वां|वीं|वें|वाँ))"; - public const string OneToNineOrdinalRegex = @"(?:पहला|पहले|पहली|तीसरे|प्रथम|दूसरा|दूसरी|दूसरे|तिहाई|चौथाई|((पांच|पाँच|छठ|सात|आठ|नौ)(वां|वीं|वें|वाँ|वा)))"; + public const string OneToNineOrdinalRegex = @"(?:पहला|(? ExtractImpl(string text, DateObject reference) tokens.AddRange(MergeTwoTimePoints(text, reference)); tokens.AddRange(MatchDuration(text, reference)); - tokens.AddRange(SingleTimePointWithPatterns(text, new List(ordinalExtractions), reference)); + tokens.AddRange(SingleTimePointWithPatterns(text, ordinalExtractions, reference)); tokens.AddRange(MatchComplexCases(text, simpleCasesResults, reference)); tokens.AddRange(MatchYearPeriod(text, reference)); - tokens.AddRange(MatchOrdinalNumberWithCenturySuffix(text, new List(ordinalExtractions))); + tokens.AddRange(MatchOrdinalNumberWithCenturySuffix(text, ordinalExtractions)); return Token.MergeAllTokens(tokens, text, ExtractorName); } diff --git a/.NET/Microsoft.Recognizers.Text.DateTime/Parsers/BaseDatePeriodParser.cs b/.NET/Microsoft.Recognizers.Text.DateTime/Parsers/BaseDatePeriodParser.cs index 691106a87f..94a759c053 100644 --- a/.NET/Microsoft.Recognizers.Text.DateTime/Parsers/BaseDatePeriodParser.cs +++ b/.NET/Microsoft.Recognizers.Text.DateTime/Parsers/BaseDatePeriodParser.cs @@ -699,9 +699,47 @@ private DateTimeResolutionResult ParseSimpleCases(string text, DateObject refere if (match.Success) { - var days = match.Groups["day"]; - beginDay = this.config.DayOfMonth[days.Captures[0].Value]; - endDay = this.config.DayOfMonth[days.Captures[1].Value]; + var days = match.Groups[Constants.DayGroupName]; + var writtenDay = match.Groups[Constants.OrdinalGroupName]; + if (writtenDay.Captures.Count > 0 && days.Captures[0].Value == writtenDay.Captures[0].Value) + { + // Parse beginDay in written form + var dayMatch = writtenDay.Captures[0]; + var dayEr = new ExtractResult + { + Start = dayMatch.Index, + Length = dayMatch.Length, + Text = dayMatch.Value, + Type = Constants.SYS_NUMBER_ORDINAL, + Metadata = new Metadata { IsOrdinalRelative = false, }, + }; + var dayPr = this.config.NumberParser.Parse(dayEr); + beginDay = (int)(double)dayPr.Value; + } + else + { + beginDay = this.config.DayOfMonth[days.Captures[0].Value]; + } + + if (writtenDay.Captures.Count > 0 && days.Captures[1].Value == writtenDay.Captures[writtenDay.Captures.Count - 1].Value) + { + // Parse endDay in written form + var dayMatch = writtenDay.Captures[writtenDay.Captures.Count - 1]; + var dayEr = new ExtractResult + { + Start = dayMatch.Index, + Length = dayMatch.Length, + Text = dayMatch.Value, + Type = Constants.SYS_NUMBER_ORDINAL, + Metadata = new Metadata { IsOrdinalRelative = false, }, + }; + var dayPr = this.config.NumberParser.Parse(dayEr); + endDay = (int)(double)dayPr.Value; + } + else + { + endDay = this.config.DayOfMonth[days.Captures[1].Value]; + } // parse year year = config.DateExtractor.GetYearFromText(match.Match); diff --git a/Patterns/English/English-DateTime.yaml b/Patterns/English/English-DateTime.yaml index ac97a16f85..2b4fc3aa42 100644 --- a/Patterns/English/English-DateTime.yaml +++ b/Patterns/English/English-DateTime.yaml @@ -60,6 +60,16 @@ WrittenTensRegex: !simpleRegex WrittenNumRegex: !nestedRegex def: (?:{WrittenOneToNineRegex}|{WrittenElevenToNineteenRegex}|{WrittenTensRegex}(\s+{WrittenOneToNineRegex})?) references: [ WrittenOneToNineRegex, WrittenElevenToNineteenRegex, WrittenTensRegex ] +WrittenOneToNineOrdinalRegex: !simpleRegex + def: (?:first|second|third|fourth|fifth|sixth|seventh|eighth|nine?th) +WrittenTensOrdinalRegex: !simpleRegex + def: (?:tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|thirtieth|fortieth|fiftieth|sixtieth|seventieth|eightieth|ninetieth) +WrittenOrdinalRegex: !nestedRegex + def: (?:{WrittenOneToNineOrdinalRegex}|{WrittenTensOrdinalRegex}|{WrittenTensRegex}\s+{WrittenOneToNineOrdinalRegex}) + references: [ WrittenOneToNineOrdinalRegex, WrittenTensOrdinalRegex, WrittenTensRegex ] +WrittenOrdinalDayRegex: !nestedRegex + def: \b(the\s+)?(?(?{WrittenOneToNineOrdinalRegex}|(?:tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|thirtieth)|(?:ten|twenty)\s+{WrittenOneToNineOrdinalRegex}|thirty\s+first))\b + references: [ WrittenOneToNineOrdinalRegex ] WrittenCenturyFullYearRegex: !nestedRegex def: (?:(one|two)\s+thousand((\s+and)?\s+{WrittenOneToNineRegex}\s+hundred)?) references: [ WrittenOneToNineRegex] @@ -137,17 +147,17 @@ FromRegex: !simpleRegex BetweenTokenRegex: !simpleRegex def: \b(between(\s+the)?)$ SimpleCasesRegex: !nestedRegex - def: \b({RangePrefixRegex}\s+)?({DayRegex})\s*{TillRegex}\s*({DayRegex}\s+{MonthSuffixRegex}|{MonthSuffixRegex}\s+{DayRegex})((\s+|\s*,\s*){YearRegex})?\b - references: [ DayRegex, TillRegex, MonthSuffixRegex, YearRegex, RangePrefixRegex ] + def: \b({RangePrefixRegex}\s+)?({DayRegex}|{WrittenOrdinalDayRegex})\s*{TillRegex}\s*(({DayRegex}|{WrittenOrdinalDayRegex})\s+{MonthSuffixRegex}|{MonthSuffixRegex}\s+({DayRegex}|{WrittenOrdinalDayRegex}))((\s+|\s*,\s*){YearRegex})?\b + references: [ DayRegex, TillRegex, MonthSuffixRegex, YearRegex, RangePrefixRegex, WrittenOrdinalDayRegex ] MonthFrontSimpleCasesRegex: !nestedRegex - def: \b({RangePrefixRegex}\s+)?{MonthSuffixRegex}\s+((from)\s+)?({DayRegex})\s*{TillRegex}\s*({DayRegex})((\s+|\s*,\s*){YearRegex})?\b - references: [ MonthSuffixRegex, DayRegex, TillRegex, YearRegex, RangePrefixRegex ] + def: \b({RangePrefixRegex}\s+)?{MonthSuffixRegex}\s+((from)\s+)?({DayRegex}|{WrittenOrdinalDayRegex})\s*{TillRegex}\s*({DayRegex}|{WrittenOrdinalDayRegex})((\s+|\s*,\s*){YearRegex})?\b + references: [ MonthSuffixRegex, DayRegex, TillRegex, YearRegex, RangePrefixRegex, WrittenOrdinalDayRegex ] MonthFrontBetweenRegex: !nestedRegex - def: \b{MonthSuffixRegex}\s+(between\s+)({DayRegex})\s*{RangeConnectorRegex}\s*({DayRegex})((\s+|\s*,\s*){YearRegex})?\b - references: [ MonthSuffixRegex, DayRegex, RangeConnectorRegex , YearRegex ] + def: \b{MonthSuffixRegex}\s+(between\s+)({DayRegex}|{WrittenOrdinalDayRegex})\s*{RangeConnectorRegex}\s*({DayRegex}|{WrittenOrdinalDayRegex})((\s+|\s*,\s*){YearRegex})?\b + references: [ MonthSuffixRegex, DayRegex, RangeConnectorRegex , YearRegex, WrittenOrdinalDayRegex ] BetweenRegex: !nestedRegex - def: \b(between\s+)({DayRegex})\s*{RangeConnectorRegex}\s*({DayRegex})\s+{MonthSuffixRegex}((\s+|\s*,\s*){YearRegex})?\b - references: [ DayRegex, RangeConnectorRegex , MonthSuffixRegex, YearRegex ] + def: \b(between\s+)({DayRegex}|{WrittenOrdinalDayRegex})\s*{RangeConnectorRegex}\s*({DayRegex}|{WrittenOrdinalDayRegex})\s+{MonthSuffixRegex}((\s+|\s*,\s*){YearRegex})?\b + references: [ DayRegex, RangeConnectorRegex , MonthSuffixRegex, YearRegex, WrittenOrdinalDayRegex ] MonthWithYear: !nestedRegex def: \b((({WrittenMonthRegex}[\.]?|((the\s+)?(?first|1st|second|2nd|third|3rd|fourth|4th|fifth|5th|sixth|6th|seventh|7th|eighth|8th|ninth|9th|tenth|10th|eleventh|11th|twelfth|12th|last)\s+month(?=\s+(of|in))))((\s*)[/\\\-\.,]?(\s+(of|in))?(\s*)({YearRegex}|(?following|next|last|this)\s+year)|\s+(of|in)\s+{TwoDigitYearRegex}))|(({YearRegex}|(?following|next|last|this)\s+year)(\s*),?(\s*){WrittenMonthRegex}))\b references: [ WrittenMonthRegex, YearRegex, TwoDigitYearRegex ] diff --git a/Patterns/Hindi/Hindi-Numbers.yaml b/Patterns/Hindi/Hindi-Numbers.yaml index 56603011d2..48b199ce4d 100644 --- a/Patterns/Hindi/Hindi-Numbers.yaml +++ b/Patterns/Hindi/Hindi-Numbers.yaml @@ -125,7 +125,7 @@ DecimalUnitsWithRoundNumberRegex: !nestedRegex RoundNumberOrdinalRegex: !simpleRegex def: (?:(सौ|हजार|हज़ार|लाख|करोड़|अरब|खरब)(वां|वीं|वें|वाँ)) OneToNineOrdinalRegex: !simpleRegex - def: (?:पहला|पहले|पहली|तीसरे|प्रथम|दूसरा|दूसरी|दूसरे|तिहाई|चौथाई|((पांच|पाँच|छठ|सात|आठ|नौ)(वां|वीं|वें|वाँ|वा))) + def: (?:पहला|(?