diff --git a/data_lang/htm8.py b/data_lang/htm8.py index b64e69c8d..f9571556d 100644 --- a/data_lang/htm8.py +++ b/data_lang/htm8.py @@ -2,15 +2,20 @@ TODO -Migrate: - -- maybe: migrate everything off of TagLexer() - - and AttrValueLexer() - this should requires Validate() +- would be nice: migrate everything off of TagLexer() + - oils_doc.py and help_gen.py + - this old API is stateful and uses Python iterators, which is problematic + - maybe we can use a better CSS selector abstraction API: - Get rid of Reset()? -- Deprecate tag_lexer.GetTagName() in favor of lx.CanonicalTagName() or - _LiteralTagName() +- Validate() can be improved + +Features: + +- work on ToXml() test cases? This is another text of AttrLexer + +C++: - UTF-8 check, like JSON8 - re2c - port lexer, which will fix static typing issues @@ -475,6 +480,7 @@ def __init__(self, s): self.name_start = -1 self.name_end = -1 + self.equal_end = -1 self.next_value_is_missing = False self.init_t = -1 @@ -523,7 +529,7 @@ def Reset(self): self.pos = self.init_t def ReadName(self): - # type: () -> Tuple[attr_name_t, int, int] + # type: () -> Tuple[attr_name_t, int, int, int] """Reads the attribute name EOF case: @@ -541,7 +547,7 @@ def ReadName(self): #log('ReadName() tag_name_pos %d pos, %d %s', self.tag_name_pos, self.pos, m.groups()) if a == attr_name.Invalid: #log('m.groups %s', m.groups()) - return attr_name.Invalid, -1, -1 + return attr_name.Invalid, -1, -1, -1 self.pos = m.end(0) # Advance if it's not invalid @@ -549,6 +555,7 @@ def ReadName(self): #log('%r', m.groups()) self.name_start = m.start(1) self.name_end = m.end(1) + self.equal_end = m.end(0) # XML conversion needs this # Is the equals sign missing? Set state. if m.group(2) is None: self.next_value_is_missing = True @@ -556,14 +563,14 @@ def ReadName(self): self.pos = self.name_end else: self.next_value_is_missing = False - return attr_name.Ok, self.name_start, self.name_end + return attr_name.Ok, self.name_start, self.name_end, self.equal_end else: # Reset state - e.g. you must call AttrNameEquals self.name_start = -1 self.name_end = -1 if a == attr_name.Done: - return attr_name.Done, -1, -1 + return attr_name.Done, -1, -1, -1 else: context = self.s[self.pos:] #log('s %r %d', self.s, self.pos) @@ -692,7 +699,7 @@ def ReadValue(self, tokens_out=None): def GetAttrRaw(attr_lx, name): # type: (AttrLexer, str) -> Optional[str] while True: - n, name_start, name_end = attr_lx.ReadName() + n, name_start, name_end, _ = attr_lx.ReadName() #log('==> ReadName %s %d %d', attr_name_str(n), name_start, name_end) if n == attr_name.Ok: if attr_lx.AttrNameEquals(name): @@ -714,10 +721,10 @@ def GetAttrRaw(attr_lx, name): def AllAttrsRawSlice(attr_lx): - # type: (AttrLexer) -> List[Tuple[int, int, attr_value_t, int, int]] + # type: (AttrLexer) -> List[Tuple[int, int, int, attr_value_t, int, int]] result = [] while True: - n, name_start, name_end = attr_lx.ReadName() + n, name_start, name_end, equal_end = attr_lx.ReadName() if 0: log(' AllAttrsRaw ==> ReadName %s %d %d %r', attr_name_str(n), name_start, name_end, attr_lx.s[attr_lx.pos:attr_lx.pos + 10]) @@ -728,7 +735,8 @@ def AllAttrsRawSlice(attr_lx): v, val_start, val_end = attr_lx.ReadValue() #val = attr_lx.s[val_start:val_end] #log(' ReadValue %r', val) - result.append((name_start, name_end, v, val_start, val_end)) + result.append( + (name_start, name_end, equal_end, v, val_start, val_end)) elif n == attr_name.Done: break elif n == attr_name.Invalid: @@ -751,7 +759,7 @@ def AllAttrsRaw(attr_lx): slices = AllAttrsRawSlice(attr_lx) pairs = [] s = attr_lx.s - for name_start, name_end, val_id, val_start, val_end in slices: + for name_start, name_end, equal_end, val_id, val_start, val_end in slices: n = s[name_start:name_end] v = s[val_start:val_end] pairs.append((n, v)) diff --git a/data_lang/htm8_test.py b/data_lang/htm8_test.py index 5996a8e93..66d74e532 100755 --- a/data_lang/htm8_test.py +++ b/data_lang/htm8_test.py @@ -62,7 +62,7 @@ def testNoAttrs(self): attr_lx.Init(tok_id, lx.TagNamePos(), end_pos) # There is no tag - n, name_start, name_end = attr_lx.ReadName() + n, name_start, name_end, _ = attr_lx.ReadName() self.assertEqual(n, attr_name.Done) self.assertEqual(-1, name_start) self.assertEqual(-1, name_end) @@ -85,7 +85,7 @@ def testInvalid(self): h = '' attr_lx = _MakeAttrLexer(self, h) - n, name_start, name_end = attr_lx.ReadName() + n, name_start, name_end, _ = attr_lx.ReadName() self.assertEqual(n, attr_name.Invalid) self.assertEqual(-1, name_start) self.assertEqual(-1, name_end) @@ -101,7 +101,7 @@ def testEmpty(self): h = '' attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag) - n, name_start, name_end = attr_lx.ReadName() + n, name_start, name_end, _ = attr_lx.ReadName() self.assertEqual(n, attr_name.Ok) self.assertEqual(5, name_start) self.assertEqual(8, name_end) @@ -116,14 +116,14 @@ def testEmpty(self): self.assertEqual(-1, attr_start) self.assertEqual(-1, attr_end) - n, name_start, name_end = attr_lx.ReadName() + n, name_start, name_end, _ = attr_lx.ReadName() self.assertEqual(n, attr_name.Done) def testMissing(self): h = '' attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag) - n, name_start, name_end = attr_lx.ReadName() + n, name_start, name_end, _ = attr_lx.ReadName() self.assertEqual(n, attr_name.Ok) self.assertEqual(5, name_start) self.assertEqual(8, name_end) @@ -137,7 +137,7 @@ def testMissing(self): self.assertEqual(-1, attr_start) self.assertEqual(-1, attr_end) - n, name_start, name_end = attr_lx.ReadName() + n, name_start, name_end, _ = attr_lx.ReadName() self.assertEqual(n, attr_name.Done) def testUnquoted(self): @@ -145,7 +145,7 @@ def testUnquoted(self): h = '' attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartEndTag) - n, name_start, name_end = attr_lx.ReadName() + n, name_start, name_end, _ = attr_lx.ReadName() self.assertEqual(n, attr_name.Ok) self.assertEqual(3, name_start) self.assertEqual(4, name_end) @@ -159,14 +159,14 @@ def testUnquoted(self): self.assertEqual(5, attr_start) self.assertEqual(8, attr_end) - n, name_start, name_end = attr_lx.ReadName() + n, name_start, name_end, _ = attr_lx.ReadName() self.assertEqual(n, attr_name.Done) def testDoubleQuoted(self): h = '' attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag) - n, name_start, name_end = attr_lx.ReadName() + n, name_start, name_end, _ = attr_lx.ReadName() self.assertEqual(n, attr_name.Ok) self.assertEqual(3, name_start) self.assertEqual(4, name_end) @@ -181,7 +181,7 @@ def testDoubleQuoted(self): self.assertEqual(8, attr_end) self.assertEqual(9, attr_lx.pos) - n, name_start, name_end = attr_lx.ReadName() + n, name_start, name_end, _ = attr_lx.ReadName() log('n = %r', attr_name_str(n)) self.assertEqual(n, attr_name.Done) @@ -189,7 +189,7 @@ def testSingleQuoted(self): h = "" attr_lx = _MakeAttrLexer(self, h, expected_tag=h8_id.StartTag) - n, name_start, name_end = attr_lx.ReadName() + n, name_start, name_end, _ = attr_lx.ReadName() self.assertEqual(n, attr_name.Ok) self.assertEqual(3, name_start) self.assertEqual(4, name_end) @@ -204,7 +204,7 @@ def testSingleQuoted(self): self.assertEqual(8, attr_end) self.assertEqual(9, attr_lx.pos) - n, name_start, name_end = attr_lx.ReadName() + n, name_start, name_end, _ = attr_lx.ReadName() #log('n = %r', attr_name_str(n)) self.assertEqual(n, attr_name.Done) @@ -212,7 +212,7 @@ def testDoubleQuoted_Bad(self): h = ' + out.PrintUntil(equal_end) + out.Print('""') + elif v == attr_value_e.Unquoted: # + # Because we disallow ", we can just surround with quotes + out.PrintUntil(val_start) + out.Print('"') + out.PrintUntil(val_end) + out.Print('"') + #val_lexer.Reset(val_start, val_end) pass # TODO: get the kind of string diff --git a/data_lang/htm8_util_test.py b/data_lang/htm8_util_test.py index 3fc80eab5..fd9a92a85 100755 --- a/data_lang/htm8_util_test.py +++ b/data_lang/htm8_util_test.py @@ -55,7 +55,7 @@ def testValid(self): VALID_LEX = [ # TODO: convert these to XML ('', UNCHANGED), - ('', ''), + ('', ''), #('', ''), ('', ''), @@ -63,9 +63,9 @@ def testValid(self): ('

x & y

', '

x & y

'), # No ambiguity - ('', ''), - ('', ''), - ('', ''), + ('', ''), + ('', UNCHANGED), + ('', ''), ] INVALID_PARSE = [ @@ -112,16 +112,16 @@ def testValid(self): ('
', ''), # no attribute - ('', ''), - ('', ''), - ('', ''), + ('', ''), + ('', ''), + ('', ''), # single quoted is pretty common ("", ''), # Conceding to reality - I used these myself - ('', ''), - ('', ''), + ('', ''), + ('', ''), ('', ''), # caps