Skip to content

Commit

Permalink
Merge pull request #101 from ipld/codectools-tokenizers
Browse files Browse the repository at this point in the history
Fresh take on codec APIs, and some tokenization utilities.
  • Loading branch information
warpfork authored Nov 14, 2020
2 parents 35ad3e3 + 1110155 commit 624fae0
Show file tree
Hide file tree
Showing 8 changed files with 873 additions and 0 deletions.
50 changes: 50 additions & 0 deletions codec/api.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package codec

import (
"io"

"github.com/ipld/go-ipld-prime"
)

// Encoder is the essential definition of a function that takes IPLD Data Model data in memory and serializes it.
// IPLD Codecs are written by implementing this function interface (as well as (typically) a matched Decoder).
//
// Encoder functions can be composed into an ipld.LinkSystem to provide
// a "one stop shop" API for handling content addressable storage.
// Encoder functions can also be used directly if you want to handle serial data streams.
//
// Most codec packages will have a ReusableEncoder type
// (which contains any working memory needed by the encoder implementation,
// as well as any configuration options),
// and that type will have an Encode function matching this interface.
//
// By convention, codec packages that have a multicodec contract will also have
// a package-scope exported function called Encode which also matches this interface,
// and is the equivalent of creating a zero-value ReusableEncoder (aka, default config)
// and using its Encode method.
// This package-scope function will typically also internally use a sync.Pool
// to keep some ReusableEncoder values on hand to avoid unnecessary allocations.
//
// Note that a ReusableEncoder type that supports configuration options
// does not functionally expose those options when invoked by the multicodec system --
// multicodec indicators do not provide room for extended configuration info.
// Codecs that expose configuration options are doing so for library users to enjoy;
// it does not mean those non-default configurations will necessarily be available
// in all scenarios that use codecs indirectly.
// There is also no standard interface for such configurations: by nature,
// if they exist at all, they vary per codec.
type Encoder func(data ipld.Node, output io.Writer) error

// Decoder is the essential definition of a function that consumes serial data and unfurls it into IPLD Data Model-compatible in-memory representations.
// IPLD Codecs are written by implementing this function interface (as well as (typically) a matched Encoder).
//
// Decoder is the dual of Encoder.
// Most of the documentation for the Encoder function interface
// also applies wholesale to the Decoder interface.
type Decoder func(into ipld.NodeAssembler, input io.Reader) error

// ErrBudgetExhausted is returned by decoders when a resource budget
// (a limit on message length or structural complexity, used to defend
// against denial-of-service via oversized or deeply nested input)
// runs out before a complete value has been decoded.
type ErrBudgetExhausted struct{}

// Error implements the error interface.
func (e ErrBudgetExhausted) Error() string {
	return "decoder resource budget exhausted (message too long or too complex)"
}
84 changes: 84 additions & 0 deletions codec/codectools/token.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package codectools

import (
"fmt"

"github.com/ipld/go-ipld-prime"
)

// Token is the unit of information exchanged by the tokenization utilities in this package.
// It is a tagged union: the Kind field says which of the value-carrying fields is meaningful.
// Tokens are intended to be reused across calls rather than freshly allocated
// (see the note on the position-tracking fields below).
type Token struct {
	Kind TokenKind // Indicates which of the fields below carries this token's value.

	Length int       // Present for MapOpen or ListOpen. May be -1 for "unknown" (e.g. a json tokenizer will yield this).
	Bool   bool      // Value. Union: only has meaning if Kind is TokenKind_Bool.
	Int    int64     // Value. Union: only has meaning if Kind is TokenKind_Int.
	Float  float64   // Value. Union: only has meaning if Kind is TokenKind_Float.
	Str    string    // Value. Union: only has meaning if Kind is TokenKind_String. ('Str' rather than 'String' to avoid collision with method.)
	Bytes  []byte    // Value. Union: only has meaning if Kind is TokenKind_Bytes.
	Link   ipld.Link // Value. Union: only has meaning if Kind is TokenKind_Link.

	Node ipld.Node // Direct pointer to the original data, if this token is used to communicate data during a walk of existing in-memory data. Absent when token is being used during deserialization.

	// The following fields all track position and progress:
	// (These may be useful to copy into any error messages if errors arise.)
	// (Implementations may assume token reuse and treat these as state keeping;
	// you may experience position accounting accuracy problems if *not* reusing tokens or if zeroing these fields.)

	pth          []ipld.PathSegment // Set by token producers (whether marshallers or deserializers) to track logical position.
	offset       int64              // Set by deserializers (for both textual or binary formats alike) to track progress.
	lineOffset   int64              // Set by deserializers that work with textual data. May be ignored by binary deserializers.
	columnOffset int64              // Set by deserializers that work with textual data. May be ignored by binary deserializers.
}

// String renders the token compactly for debugging:
// the kind character, plus the relevant value (when the kind carries one),
// wrapped in angle brackets.  Unknown kinds render as "<INVALID>".
func (tk Token) String() string {
	switch tk.Kind {
	case TokenKind_MapOpen, TokenKind_ListOpen:
		return fmt.Sprintf("<%c:%d>", tk.Kind, tk.Length)
	case TokenKind_MapClose, TokenKind_ListClose, TokenKind_Null:
		return fmt.Sprintf("<%c>", tk.Kind)
	case TokenKind_Bool:
		return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Bool)
	case TokenKind_Int:
		return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Int)
	case TokenKind_Float:
		return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Float)
	case TokenKind_String:
		return fmt.Sprintf("<%c:%q>", tk.Kind, tk.Str)
	case TokenKind_Bytes:
		return fmt.Sprintf("<%c:%x>", tk.Kind, tk.Bytes)
	case TokenKind_Link:
		return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Link)
	default:
		return "<INVALID>"
	}
}

// TokenKind denotes which kind of token a Token is.
// The values are printable ASCII characters chosen mnemonically,
// which makes raw token kinds comfortable to read in debug output
// (see Token.String, which prints the kind with the %c verb).
type TokenKind uint8

// The TokenKind values enumerate every kind of token:
// the open and close brackets of maps and lists, and each scalar kind.
const (
	TokenKind_MapOpen   TokenKind = '{'
	TokenKind_MapClose  TokenKind = '}'
	TokenKind_ListOpen  TokenKind = '['
	TokenKind_ListClose TokenKind = ']'
	TokenKind_Null      TokenKind = '0'
	TokenKind_Bool      TokenKind = 'b'
	TokenKind_Int       TokenKind = 'i'
	TokenKind_Float     TokenKind = 'f'
	TokenKind_String    TokenKind = 's'
	TokenKind_Bytes     TokenKind = 'x'
	TokenKind_Link      TokenKind = '/'
)

// ErrMalformedTokenSequence is the error returned by token consumers
// when the stream of tokens cannot possibly describe a well-formed value
// (for example, a close token arriving when no map or list is open).
type ErrMalformedTokenSequence struct {
	Detail string // Human-readable description of what was out of order.
}

// Error implements the error interface.
func (e ErrMalformedTokenSequence) Error() string {
	return "malformed token sequence: " + e.Detail
}
258 changes: 258 additions & 0 deletions codec/codectools/token_consumers.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
package codectools

import (
"fmt"
"io"

"github.com/ipld/go-ipld-prime"
"github.com/ipld/go-ipld-prime/codec"
)

// TokenAssemble repeatedly pulls tokens from tr and applies their data to na,
// continuing until exactly one complete value has been assembled (or an error occurs).
//
// To compare and contrast to other token oriented tools:
// information flows in the same direction here as it does through the TokenAssembler gadget,
// but TokenAssemble consumes a whole value in a single call,
// whereas TokenAssembler has tokens pushed into it one step at a time.
//
// The Data Model rule that map keys must be strings is not enforced here;
// fully recursive structures are accepted even in key position,
// so this function is also usable for schema values such as maps with complex keys.
//
// The budget parameter bounds the amount of work accepted
// before giving up with codec.ErrBudgetExhausted.
func TokenAssemble(na ipld.NodeAssembler, tr TokenReader, budget int) error {
	first, err := tr(&budget)
	if err != nil {
		return err
	}
	return tokenAssemble(na, first, tr, &budget)
}

// tokenAssemble assembles exactly one complete value into na,
// beginning with the already-read token tk and pulling any further tokens from tr.
// It recurses for the children of maps and lists.
//
// The budget is decremented as content is consumed (one unit per scalar or map entry part,
// plus the byte length of strings and bytes, plus pre-checks on announced map/list lengths);
// once it goes negative, assembly aborts with codec.ErrBudgetExhausted.
func tokenAssemble(na ipld.NodeAssembler, tk *Token, tr TokenReader, budget *int) error {
	if *budget < 0 {
		return codec.ErrBudgetExhausted{}
	}
	switch tk.Kind {
	case TokenKind_MapOpen:
		if tk.Length > 0 && *budget < tk.Length*2 { // Pre-check budget: at least two decrements estimated for each entry.
			return codec.ErrBudgetExhausted{}
		}
		ma, err := na.BeginMap(tk.Length)
		if err != nil {
			return err
		}
		for {
			// Peek one token. We need to see if the map is about to end or not.
			tk, err = tr(budget)
			if err != nil {
				return err
			}
			// If the map has ended, invoke the finish operation and check for any errors.
			if tk.Kind == TokenKind_MapClose {
				return ma.Finish()
			}
			// Recurse to assemble the key.
			*budget-- // Decrement budget by at least one for each key. The key content may also cause further decrements.
			if err = tokenAssemble(ma.AssembleKey(), tk, tr, budget); err != nil {
				return err
			}
			// Recurse to assemble the value.
			// (We don't really care to peek this token, but do so anyway to keep the calling convention regular.)
			tk, err = tr(budget)
			if err != nil {
				return err
			}
			*budget-- // Decrement budget by at least one for each value. The value content may also cause further decrements.
			if err = tokenAssemble(ma.AssembleValue(), tk, tr, budget); err != nil {
				return err
			}
			// Continue around the loop, to encounter either the next entry or the end of the map.
		}
	case TokenKind_MapClose:
		return ErrMalformedTokenSequence{"map close token encountered while not in the middle of a map"}
	case TokenKind_ListOpen:
		if tk.Length > 0 && *budget < tk.Length { // Pre-check budget: at least one decrement estimated for each entry.
			return codec.ErrBudgetExhausted{}
		}
		la, err := na.BeginList(tk.Length)
		if err != nil {
			return err
		}
		for {
			// Peek one token. We need to see if the list is about to end or not.
			tk, err = tr(budget)
			if err != nil {
				return err
			}
			// If the list has ended, invoke the finish operation and check for any errors.
			if tk.Kind == TokenKind_ListClose {
				return la.Finish()
			}
			// Recurse to assemble the value.
			*budget-- // Decrement budget by at least one for each value. The value content may also cause further decrements.
			if err = tokenAssemble(la.AssembleValue(), tk, tr, budget); err != nil {
				return err
			}
			// Continue around the loop, to encounter either the next value or the end of the list.
		}
	case TokenKind_ListClose:
		return ErrMalformedTokenSequence{"list close token encountered while not in the middle of a list"}
	case TokenKind_Null:
		// NOTE(review): null does not decrement the budget, unlike the other scalars below —
		// presumably because it carries no content; confirm this asymmetry is intentional.
		return na.AssignNull()
	case TokenKind_Bool:
		*budget--
		return na.AssignBool(tk.Bool)
	case TokenKind_Int:
		*budget--
		return na.AssignInt(int(tk.Int))
	case TokenKind_Float:
		*budget--
		return na.AssignFloat(tk.Float)
	case TokenKind_String:
		*budget -= len(tk.Str)
		return na.AssignString(tk.Str)
	case TokenKind_Bytes:
		*budget -= len(tk.Bytes)
		return na.AssignBytes(tk.Bytes)
	case TokenKind_Link:
		*budget--
		return na.AssignLink(tk.Link)
	default:
		// Unreachable for any token produced by this package; indicates a programmer error upstream.
		panic(fmt.Errorf("unrecognized token kind (%q?)", tk.Kind))
	}
}

// --- the stepwise assembler system (more complicated; has a userland stack) is below -->

// TokenAssembler is the stepwise counterpart of the TokenAssemble function:
// rather than pulling a whole value's worth of tokens from a reader in one call,
// it accepts tokens pushed into it one at a time via the Process method,
// and feeds their data into a stack of NodeAssemblers.
// Call Initialize before first use (and again to reset it for reuse).
type TokenAssembler struct {
	// This structure is designed to be embeddable. Use Initialize when doing so.

	stk    assemblerStack // Stack of assemblers for the recursives currently open; the tip receives the next token's data.
	budget int64          // Remaining resource budget, set by Initialize. NOTE(review): the Process code visible here does not decrement this — confirm where it is enforced.
}

// assemblerStackRow is one frame of the TokenAssembler's stack:
// the NodeAssembler for a value in progress, plus (if that value is a map or list)
// the corresponding map/list assembler and a state marker saying what comes next.
type assemblerStackRow struct {
	state uint8              // 0: assign this node; 1: continue list; 2: continue map with key; 3: continue map with value.
	na    ipld.NodeAssembler // Always present.
	la    ipld.ListAssembler // At most one of these is present.
	ma    ipld.MapAssembler  // At most one of these is present.
}

// assemblerStack is a LIFO stack of in-progress assemblers;
// the tip corresponds to the innermost value currently being assembled.
type assemblerStack []assemblerStackRow

// Tip returns a pointer to the row currently at the top of the stack.
// Calling Tip on an empty stack is invalid (it will panic on the index).
func (stk assemblerStack) Tip() *assemblerStackRow {
	return &stk[len(stk)-1]
}

// Push places a fresh row wrapping the given NodeAssembler on top of the stack.
func (stk *assemblerStack) Push(na ipld.NodeAssembler) {
	*stk = append(*stk, assemblerStackRow{na: na})
}

// Pop discards the row at the top of the stack; it is a no-op on an empty stack.
func (stk *assemblerStack) Pop() {
	if n := len(*stk); n > 0 {
		*stk = (*stk)[:n-1]
	}
}

// Initialize readies the TokenAssembler to assemble one new value into na,
// with the given resource budget.
// Any previously allocated stack storage is retained and reused.
func (ta *TokenAssembler) Initialize(na ipld.NodeAssembler, budget int64) {
	if ta.stk != nil {
		ta.stk = ta.stk[:0] // Truncate in place, keeping the backing array.
	} else {
		ta.stk = make(assemblerStack, 0, 10)
	}
	ta.stk.Push(na)
	ta.budget = budget
}

// Process takes a Token pointer as an argument.
// (Notice how this function happens to match the definition of the visitFn that's usable as an argument to TokenWalk.)
// The token argument can be understood to be "borrowed" for the duration of the Process call, but will not be mutated.
// The use of a pointer here is so that a single Token can be reused by multiple calls, avoiding unnecessary allocations.
//
// Note that Process does very little sanity checking of token sequences itself,
// mostly handing information to the NodeAssemblers directly,
// which presumably will reject the data if it is out of line.
// The NodeAssembler this TokenAssembler is wrapping should already be enforcing the relevant logical rules,
// so it is not useful for TokenAssembler.Process to attempt to duplicate those checks;
// TokenAssembler.Process will also return any errors from the NodeAssembler without attempting to enforce a pattern on those errors.
// In particular, TokenAssembler.Process does not check if every MapOpen is paired with a MapClose;
// it does not check if every ListOpen is paired with a ListClose;
// and it does not check if the token stream is continuing after all open recursives have been closed.
// TODO: review this documentation; more of these checks turn out necessary anyway than originally expected.
//
// Once the stack has fully emptied (the value is complete), further calls return io.EOF.
func (ta *TokenAssembler) Process(tk *Token) (err error) {
	if len(ta.stk) == 0 {
		return io.EOF
	}
	tip := ta.stk.Tip()
	switch tip.state {
	case 0: // The tip assembler is ready to receive a value.
		switch tk.Kind {
		case TokenKind_MapOpen:
			tip.ma, err = tip.na.BeginMap(tk.Length)
			tip.state = 2
			return err
		case TokenKind_MapClose:
			// Mostly we try to just forward things, but can't not check this one: tip.ma would be nil; there's no reasonable target for forwarding.
			return ErrMalformedTokenSequence{"map close token encountered while not in the middle of a map"}
		case TokenKind_ListOpen:
			tip.la, err = tip.na.BeginList(tk.Length)
			tip.state = 1
			return err
		case TokenKind_ListClose:
			// Mostly we try to just forward things, but can't not check this one: tip.la would be nil; there's no reasonable target for forwarding.
			return ErrMalformedTokenSequence{"list close token encountered while not in the middle of a list"}
		case TokenKind_Null:
			err = tip.na.AssignNull()
			ta.stk.Pop()
			return err
		case TokenKind_Bool:
			err = tip.na.AssignBool(tk.Bool)
			ta.stk.Pop()
			return err
		case TokenKind_Int:
			err = tip.na.AssignInt(int(tk.Int)) // TODO: upgrade all of ipld to use high precision int consistently
			ta.stk.Pop()
			return err
		case TokenKind_Float:
			err = tip.na.AssignFloat(tk.Float)
			ta.stk.Pop()
			return err
		case TokenKind_String:
			err = tip.na.AssignString(tk.Str)
			ta.stk.Pop()
			return err
		case TokenKind_Bytes:
			err = tip.na.AssignBytes(tk.Bytes)
			ta.stk.Pop()
			return err
		case TokenKind_Link:
			err = tip.na.AssignLink(tk.Link)
			ta.stk.Pop()
			return err
		default:
			panic(fmt.Errorf("unrecognized token kind (%q?)", tk.Kind))
		}
		// (No statement here: every case above returns or panics; a trailing
		// `return nil` would be unreachable and is flagged by `go vet`.)
	case 1: // Inside a list: expecting either a value or the list's close.
		if tk.Kind == TokenKind_ListClose {
			err = tip.la.Finish()
			ta.stk.Pop()
			return err
		}
		ta.stk.Push(tip.la.AssembleValue())
		return ta.Process(tk) // Reprocess the same token with the child assembler on top.
	case 2: // Inside a map: expecting either a key or the map's close.
		if tk.Kind == TokenKind_MapClose {
			err = tip.ma.Finish()
			ta.stk.Pop()
			return err
		}
		tip.state = 3
		ta.stk.Push(tip.ma.AssembleKey())
		return ta.Process(tk) // Reprocess the same token with the key assembler on top.
	case 3: // Inside a map, key just finished: expecting the entry's value.
		tip.state = 2
		ta.stk.Push(tip.ma.AssembleValue())
		return ta.Process(tk) // Reprocess the same token with the value assembler on top.
	default:
		panic("unreachable")
	}
}
Loading

0 comments on commit 624fae0

Please sign in to comment.