Skip to content

Commit

Permalink
Merge pull request #101 from ipld/codectools-tokenizers
Browse files Browse the repository at this point in the history
Fresh take on codec APIs, and some tokenization utilities.
  • Loading branch information
warpfork authored Nov 14, 2020
2 parents 35ad3e3 + 1110155 commit 624fae0
Show file tree
Hide file tree
Showing 8 changed files with 873 additions and 0 deletions.
50 changes: 50 additions & 0 deletions codec/api.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package codec

import (
"io"

"github.com/ipld/go-ipld-prime"
)

// Encoder is the essential definition of a function that takes IPLD Data Model data in memory and serializes it.
// IPLD Codecs are written by implementing this function interface (as well as (typically) a matched Decoder).
//
// Encoder functions can be composed into an ipld.LinkSystem to provide
// a "one stop shop" API for handling content addressable storage.
// Encoder functions can also be used directly if you want to handle serial data streams.
//
// Most codec packages will have a ReusableEncoder type
// (which contains any working memory needed by the encoder implementation,
// as well as any configuration options),
// and that type will have an Encode function matching this interface.
//
// By convention, codec packages that have a multicodec contract will also have
// a package-scope exported function called Encode which also matches this interface,
// and is the equivalent of creating a zero-value ReusableEncoder (aka, default config)
// and using its Encode method.
// This package-scope function will typically also internally use a sync.Pool
// to keep some ReusableEncoder values on hand to avoid unnecessary allocations.
//
// Note that a ReusableEncoder type that supports configuration options
// does not functionally expose those options when invoked by the multicodec system --
// multicodec indicators do not provide room for extended configuration info.
// Codecs that expose configuration options are doing so for library users to enjoy;
// it does not mean those non-default configurations will necessarily be available
// in all scenarios that use codecs indirectly.
// There is also no standard interface for such configurations: by nature,
// if they exist at all, they vary per codec.
type Encoder func(data ipld.Node, output io.Writer) error

// Decoder is the essential definition of a function that consumes serial data and unfurls it into IPLD Data Model-compatible in-memory representations.
// IPLD Codecs are written by implementing this function interface (as well as (typically) a matched Encoder).
//
// Decoder is the dual of Encoder.
// Most of the documentation for the Encoder function interface
// also applies wholesale to the Decoder interface.
type Decoder func(into ipld.NodeAssembler, input io.Reader) error

// ErrBudgetExhausted is returned by decoders when a resource budget
// (a limit on message length or structural complexity, used to defend
// against denial-of-service via oversized or deeply nested input)
// runs out before a complete value has been decoded.
type ErrBudgetExhausted struct{}

// Error implements the error interface.
func (e ErrBudgetExhausted) Error() string {
	return "decoder resource budget exhausted (message too long or too complex)"
}
84 changes: 84 additions & 0 deletions codec/codectools/token.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package codectools

import (
"fmt"

"github.com/ipld/go-ipld-prime"
)

// Token is the unit of information exchanged by the tokenization utilities in this package.
// It is a tagged union: the Kind field says which of the value-carrying fields is meaningful.
// Tokens are intended to be reused across calls rather than freshly allocated
// (see the note on the position-tracking fields below).
type Token struct {
	Kind TokenKind // Indicates which of the fields below carries this token's value.

	Length int       // Present for MapOpen or ListOpen. May be -1 for "unknown" (e.g. a json tokenizer will yield this).
	Bool   bool      // Value. Union: only has meaning if Kind is TokenKind_Bool.
	Int    int64     // Value. Union: only has meaning if Kind is TokenKind_Int.
	Float  float64   // Value. Union: only has meaning if Kind is TokenKind_Float.
	Str    string    // Value. Union: only has meaning if Kind is TokenKind_String. ('Str' rather than 'String' to avoid collision with method.)
	Bytes  []byte    // Value. Union: only has meaning if Kind is TokenKind_Bytes.
	Link   ipld.Link // Value. Union: only has meaning if Kind is TokenKind_Link.

	Node ipld.Node // Direct pointer to the original data, if this token is used to communicate data during a walk of existing in-memory data. Absent when token is being used during deserialization.

	// The following fields all track position and progress:
	// (These may be useful to copy into any error messages if errors arise.)
	// (Implementations may assume token reuse and treat these as state keeping;
	// you may experience position accounting accuracy problems if *not* reusing tokens or if zeroing these fields.)

	pth          []ipld.PathSegment // Set by token producers (whether marshallers or deserializers) to track logical position.
	offset       int64              // Set by deserializers (for both textual or binary formats alike) to track progress.
	lineOffset   int64              // Set by deserializers that work with textual data. May be ignored by binary deserializers.
	columnOffset int64              // Set by deserializers that work with textual data. May be ignored by binary deserializers.
}

// String renders the token compactly for debugging:
// the kind character, plus the relevant value (when the kind carries one),
// wrapped in angle brackets.  Unknown kinds render as "<INVALID>".
func (tk Token) String() string {
	switch tk.Kind {
	case TokenKind_MapOpen, TokenKind_ListOpen:
		return fmt.Sprintf("<%c:%d>", tk.Kind, tk.Length)
	case TokenKind_MapClose, TokenKind_ListClose, TokenKind_Null:
		return fmt.Sprintf("<%c>", tk.Kind)
	case TokenKind_Bool:
		return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Bool)
	case TokenKind_Int:
		return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Int)
	case TokenKind_Float:
		return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Float)
	case TokenKind_String:
		return fmt.Sprintf("<%c:%q>", tk.Kind, tk.Str)
	case TokenKind_Bytes:
		return fmt.Sprintf("<%c:%x>", tk.Kind, tk.Bytes)
	case TokenKind_Link:
		return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Link)
	default:
		return "<INVALID>"
	}
}

// TokenKind denotes which kind of token a Token is.
// The values are printable ASCII characters chosen mnemonically,
// which makes raw token kinds comfortable to read in debug output
// (see Token.String, which prints the kind with the %c verb).
type TokenKind uint8

// The TokenKind values enumerate every kind of token:
// the open and close brackets of maps and lists, and each scalar kind.
const (
	TokenKind_MapOpen   TokenKind = '{'
	TokenKind_MapClose  TokenKind = '}'
	TokenKind_ListOpen  TokenKind = '['
	TokenKind_ListClose TokenKind = ']'
	TokenKind_Null      TokenKind = '0'
	TokenKind_Bool      TokenKind = 'b'
	TokenKind_Int       TokenKind = 'i'
	TokenKind_Float     TokenKind = 'f'
	TokenKind_String    TokenKind = 's'
	TokenKind_Bytes     TokenKind = 'x'
	TokenKind_Link      TokenKind = '/'
)

// ErrMalformedTokenSequence is the error returned by token consumers
// when the stream of tokens cannot possibly describe a well-formed value
// (for example, a close token arriving when no map or list is open).
type ErrMalformedTokenSequence struct {
	Detail string // Human-readable description of what was out of order.
}

// Error implements the error interface.
func (e ErrMalformedTokenSequence) Error() string {
	return "malformed token sequence: " + e.Detail
}
258 changes: 258 additions & 0 deletions codec/codectools/token_consumers.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
package codectools

import (
"fmt"
"io"

"github.com/ipld/go-ipld-prime"
"github.com/ipld/go-ipld-prime/codec"
)

// TokenAssemble repeatedly pulls tokens from tr and applies their data to na,
// continuing until exactly one complete value has been assembled (or an error occurs).
//
// To compare and contrast to other token oriented tools:
// information flows in the same direction here as it does through the TokenAssembler gadget,
// but TokenAssemble consumes a whole value in a single call,
// whereas TokenAssembler has tokens pushed into it one step at a time.
//
// The Data Model rule that map keys must be strings is not enforced here;
// fully recursive structures are accepted even in key position,
// so this function is also usable for schema values such as maps with complex keys.
//
// The budget parameter bounds the amount of work accepted
// before giving up with codec.ErrBudgetExhausted.
func TokenAssemble(na ipld.NodeAssembler, tr TokenReader, budget int) error {
	first, err := tr(&budget)
	if err != nil {
		return err
	}
	return tokenAssemble(na, first, tr, &budget)
}

// tokenAssemble assembles exactly one complete value into na,
// beginning with the already-read token tk and pulling any further tokens from tr.
// It recurses for the children of maps and lists.
//
// The budget is decremented as content is consumed (one unit per scalar or map entry part,
// plus the byte length of strings and bytes, plus pre-checks on announced map/list lengths);
// once it goes negative, assembly aborts with codec.ErrBudgetExhausted.
func tokenAssemble(na ipld.NodeAssembler, tk *Token, tr TokenReader, budget *int) error {
	if *budget < 0 {
		return codec.ErrBudgetExhausted{}
	}
	switch tk.Kind {
	case TokenKind_MapOpen:
		if tk.Length > 0 && *budget < tk.Length*2 { // Pre-check budget: at least two decrements estimated for each entry.
			return codec.ErrBudgetExhausted{}
		}
		ma, err := na.BeginMap(tk.Length)
		if err != nil {
			return err
		}
		for {
			// Peek one token. We need to see if the map is about to end or not.
			tk, err = tr(budget)
			if err != nil {
				return err
			}
			// If the map has ended, invoke the finish operation and check for any errors.
			if tk.Kind == TokenKind_MapClose {
				return ma.Finish()
			}
			// Recurse to assemble the key.
			*budget-- // Decrement budget by at least one for each key. The key content may also cause further decrements.
			if err = tokenAssemble(ma.AssembleKey(), tk, tr, budget); err != nil {
				return err
			}
			// Recurse to assemble the value.
			// (We don't really care to peek this token, but do so anyway to keep the calling convention regular.)
			tk, err = tr(budget)
			if err != nil {
				return err
			}
			*budget-- // Decrement budget by at least one for each value. The value content may also cause further decrements.
			if err = tokenAssemble(ma.AssembleValue(), tk, tr, budget); err != nil {
				return err
			}
			// Continue around the loop, to encounter either the next entry or the end of the map.
		}
	case TokenKind_MapClose:
		return ErrMalformedTokenSequence{"map close token encountered while not in the middle of a map"}
	case TokenKind_ListOpen:
		if tk.Length > 0 && *budget < tk.Length { // Pre-check budget: at least one decrement estimated for each entry.
			return codec.ErrBudgetExhausted{}
		}
		la, err := na.BeginList(tk.Length)
		if err != nil {
			return err
		}
		for {
			// Peek one token. We need to see if the list is about to end or not.
			tk, err = tr(budget)
			if err != nil {
				return err
			}
			// If the list has ended, invoke the finish operation and check for any errors.
			if tk.Kind == TokenKind_ListClose {
				return la.Finish()
			}
			// Recurse to assemble the value.
			*budget-- // Decrement budget by at least one for each value. The value content may also cause further decrements.
			if err = tokenAssemble(la.AssembleValue(), tk, tr, budget); err != nil {
				return err
			}
			// Continue around the loop, to encounter either the next value or the end of the list.
		}
	case TokenKind_ListClose:
		return ErrMalformedTokenSequence{"list close token encountered while not in the middle of a list"}
	case TokenKind_Null:
		// NOTE(review): null does not decrement the budget, unlike the other scalars below —
		// presumably because it carries no content; confirm this asymmetry is intentional.
		return na.AssignNull()
	case TokenKind_Bool:
		*budget--
		return na.AssignBool(tk.Bool)
	case TokenKind_Int:
		*budget--
		return na.AssignInt(int(tk.Int))
	case TokenKind_Float:
		*budget--
		return na.AssignFloat(tk.Float)
	case TokenKind_String:
		*budget -= len(tk.Str)
		return na.AssignString(tk.Str)
	case TokenKind_Bytes:
		*budget -= len(tk.Bytes)
		return na.AssignBytes(tk.Bytes)
	case TokenKind_Link:
		*budget--
		return na.AssignLink(tk.Link)
	default:
		// Unreachable for any token produced by this package; indicates a programmer error upstream.
		panic(fmt.Errorf("unrecognized token kind (%q?)", tk.Kind))
	}
}

// --- the stepwise assembler system (more complicated; has a userland stack) is below -->

// TokenAssembler is the stepwise counterpart of the TokenAssemble function:
// rather than pulling a whole value's worth of tokens from a reader in one call,
// it accepts tokens pushed into it one at a time via the Process method,
// and feeds their data into a stack of NodeAssemblers.
// Call Initialize before first use (and again to reset it for reuse).
type TokenAssembler struct {
	// This structure is designed to be embeddable. Use Initialize when doing so.

	stk    assemblerStack // Stack of assemblers for the recursives currently open; the tip receives the next token's data.
	budget int64          // Remaining resource budget, set by Initialize. NOTE(review): the Process code visible here does not decrement this — confirm where it is enforced.
}

// assemblerStackRow is one frame of the TokenAssembler's stack:
// the NodeAssembler for a value in progress, plus (if that value is a map or list)
// the corresponding map/list assembler and a state marker saying what comes next.
type assemblerStackRow struct {
	state uint8              // 0: assign this node; 1: continue list; 2: continue map with key; 3: continue map with value.
	na    ipld.NodeAssembler // Always present.
	la    ipld.ListAssembler // At most one of these is present.
	ma    ipld.MapAssembler  // At most one of these is present.
}

// assemblerStack is a LIFO stack of in-progress assemblers;
// the tip corresponds to the innermost value currently being assembled.
type assemblerStack []assemblerStackRow

// Tip returns a pointer to the row currently at the top of the stack.
// Calling Tip on an empty stack is invalid (it will panic on the index).
func (stk assemblerStack) Tip() *assemblerStackRow {
	return &stk[len(stk)-1]
}

// Push places a fresh row wrapping the given NodeAssembler on top of the stack.
func (stk *assemblerStack) Push(na ipld.NodeAssembler) {
	*stk = append(*stk, assemblerStackRow{na: na})
}

// Pop discards the row at the top of the stack; it is a no-op on an empty stack.
func (stk *assemblerStack) Pop() {
	if n := len(*stk); n > 0 {
		*stk = (*stk)[:n-1]
	}
}

// Initialize readies the TokenAssembler to assemble one new value into na,
// with the given resource budget.
// Any previously allocated stack storage is retained and reused.
func (ta *TokenAssembler) Initialize(na ipld.NodeAssembler, budget int64) {
	if ta.stk != nil {
		ta.stk = ta.stk[:0] // Truncate in place, keeping the backing array.
	} else {
		ta.stk = make(assemblerStack, 0, 10)
	}
	ta.stk.Push(na)
	ta.budget = budget
}

// Process takes a Token pointer as an argument.
// (Notice how this function happens to match the definition of the visitFn that's usable as an argument to TokenWalk.)
// The token argument can be understood to be "borrowed" for the duration of the Process call, but will not be mutated.
// The use of a pointer here is so that a single Token can be reused by multiple calls, avoiding unnecessary allocations.
//
// Note that Process does very little sanity checking of token sequences itself,
// mostly handing information to the NodeAssemblers directly,
// which presumably will reject the data if it is out of line.
// The NodeAssembler this TokenAssembler is wrapping should already be enforcing the relevant logical rules,
// so it is not useful for TokenAssembler.Process to attempt to duplicate those checks;
// TokenAssembler.Process will also return any errors from the NodeAssembler without attempting to enforce a pattern on those errors.
// In particular, TokenAssembler.Process does not check if every MapOpen is paired with a MapClose;
// it does not check if every ListOpen is paired with a ListClose;
// and it does not check if the token stream is continuing after all open recursives have been closed.
// TODO: review this documentation; more of these checks turn out necessary anyway than originally expected.
//
// Once the stack has fully emptied (the value is complete), further calls return io.EOF.
func (ta *TokenAssembler) Process(tk *Token) (err error) {
	if len(ta.stk) == 0 {
		return io.EOF
	}
	tip := ta.stk.Tip()
	switch tip.state {
	case 0: // The tip assembler is ready to receive a value.
		switch tk.Kind {
		case TokenKind_MapOpen:
			tip.ma, err = tip.na.BeginMap(tk.Length)
			tip.state = 2
			return err
		case TokenKind_MapClose:
			// Mostly we try to just forward things, but can't not check this one: tip.ma would be nil; there's no reasonable target for forwarding.
			return ErrMalformedTokenSequence{"map close token encountered while not in the middle of a map"}
		case TokenKind_ListOpen:
			tip.la, err = tip.na.BeginList(tk.Length)
			tip.state = 1
			return err
		case TokenKind_ListClose:
			// Mostly we try to just forward things, but can't not check this one: tip.la would be nil; there's no reasonable target for forwarding.
			return ErrMalformedTokenSequence{"list close token encountered while not in the middle of a list"}
		case TokenKind_Null:
			err = tip.na.AssignNull()
			ta.stk.Pop()
			return err
		case TokenKind_Bool:
			err = tip.na.AssignBool(tk.Bool)
			ta.stk.Pop()
			return err
		case TokenKind_Int:
			err = tip.na.AssignInt(int(tk.Int)) // TODO: upgrade all of ipld to use high precision int consistently
			ta.stk.Pop()
			return err
		case TokenKind_Float:
			err = tip.na.AssignFloat(tk.Float)
			ta.stk.Pop()
			return err
		case TokenKind_String:
			err = tip.na.AssignString(tk.Str)
			ta.stk.Pop()
			return err
		case TokenKind_Bytes:
			err = tip.na.AssignBytes(tk.Bytes)
			ta.stk.Pop()
			return err
		case TokenKind_Link:
			err = tip.na.AssignLink(tk.Link)
			ta.stk.Pop()
			return err
		default:
			panic(fmt.Errorf("unrecognized token kind (%q?)", tk.Kind))
		}
		// (No statement here: every case above returns or panics; a trailing
		// `return nil` would be unreachable and is flagged by `go vet`.)
	case 1: // Inside a list: expecting either a value or the list's close.
		if tk.Kind == TokenKind_ListClose {
			err = tip.la.Finish()
			ta.stk.Pop()
			return err
		}
		ta.stk.Push(tip.la.AssembleValue())
		return ta.Process(tk) // Reprocess the same token with the child assembler on top.
	case 2: // Inside a map: expecting either a key or the map's close.
		if tk.Kind == TokenKind_MapClose {
			err = tip.ma.Finish()
			ta.stk.Pop()
			return err
		}
		tip.state = 3
		ta.stk.Push(tip.ma.AssembleKey())
		return ta.Process(tk) // Reprocess the same token with the key assembler on top.
	case 3: // Inside a map, key just finished: expecting the entry's value.
		tip.state = 2
		ta.stk.Push(tip.ma.AssembleValue())
		return ta.Process(tk) // Reprocess the same token with the value assembler on top.
	default:
		panic("unreachable")
	}
}
Loading

0 comments on commit 624fae0

Please sign in to comment.