Merge pull request #101 from ipld/codectools-tokenizers
Fresh take on codec APIs, and some tokenization utilities.
Showing 8 changed files with 873 additions and 0 deletions.
@@ -0,0 +1,50 @@
package codec

import (
	"io"

	"github.com/ipld/go-ipld-prime"
)

// Encoder is the essential definition of a function that takes IPLD Data Model data in memory and serializes it.
// IPLD Codecs are written by implementing this function interface (as well as (typically) a matched Decoder).
//
// Encoder functions can be composed into an ipld.LinkSystem to provide
// a "one stop shop" API for handling content addressable storage.
// Encoder functions can also be used directly if you want to handle serial data streams.
//
// Most codec packages will have a ReusableEncoder type
// (which contains any working memory needed by the encoder implementation,
// as well as any configuration options),
// and that type will have an Encode function matching this interface.
//
// By convention, codec packages that have a multicodec contract will also have
// a package-scope exported function called Encode which also matches this interface,
// and is the equivalent of creating a zero-value ReusableEncoder (aka, default config)
// and using its Encode method.
// This package-scope function will typically also internally use a sync.Pool
// to keep some ReusableEncoder values on hand to avoid unnecessary allocations.
//
// Note that a ReusableEncoder type that supports configuration options
// does not functionally expose those options when invoked by the multicodec system --
// multicodec indicators do not provide room for extended configuration info.
// Codecs that expose configuration options are doing so for library users to enjoy;
// it does not mean those non-default configurations will necessarily be available
// in all scenarios that use codecs indirectly.
// There is also no standard interface for such configurations: by nature,
// if they exist at all, they vary per codec.
type Encoder func(data ipld.Node, output io.Writer) error

// Decoder is the essential definition of a function that consumes serial data and unfurls it into IPLD Data Model-compatible in-memory representations.
// IPLD Codecs are written by implementing this function interface (as well as (typically) a matched Encoder).
//
// Decoder is the dual of Encoder.
// Most of the documentation for the Encoder function interface
// also applies wholesale to the Decoder interface.
type Decoder func(into ipld.NodeAssembler, input io.Reader) error

type ErrBudgetExhausted struct{}

func (e ErrBudgetExhausted) Error() string {
	return "decoder resource budget exhausted (message too long or too complex)"
}
@@ -0,0 +1,84 @@
package codectools

import (
	"fmt"

	"github.com/ipld/go-ipld-prime"
)

type Token struct {
	Kind TokenKind

	Length int       // Present for MapOpen or ListOpen. May be -1 for "unknown" (e.g. a json tokenizer will yield this).
	Bool   bool      // Value. Union: only has meaning if Kind is TokenKind_Bool.
	Int    int64     // Value. Union: only has meaning if Kind is TokenKind_Int.
	Float  float64   // Value. Union: only has meaning if Kind is TokenKind_Float.
	Str    string    // Value. Union: only has meaning if Kind is TokenKind_String. ('Str' rather than 'String' to avoid collision with method.)
	Bytes  []byte    // Value. Union: only has meaning if Kind is TokenKind_Bytes.
	Link   ipld.Link // Value. Union: only has meaning if Kind is TokenKind_Link.

	Node ipld.Node // Direct pointer to the original data, if this token is used to communicate data during a walk of existing in-memory data. Absent when token is being used during deserialization.

	// The following fields all track position and progress:
	// (These may be useful to copy into any error messages if errors arise.)
	// (Implementations may assume token reuse and treat these as state keeping;
	// you may experience position accounting accuracy problems if *not* reusing tokens or if zeroing these fields.)

	pth          []ipld.PathSegment // Set by token producers (whether marshallers or deserializers) to track logical position.
	offset       int64              // Set by deserializers (for both textual or binary formats alike) to track progress.
	lineOffset   int64              // Set by deserializers that work with textual data. May be ignored by binary deserializers.
	columnOffset int64              // Set by deserializers that work with textual data. May be ignored by binary deserializers.
}

func (tk Token) String() string {
	switch tk.Kind {
	case TokenKind_MapOpen:
		return fmt.Sprintf("<%c:%d>", tk.Kind, tk.Length)
	case TokenKind_MapClose:
		return fmt.Sprintf("<%c>", tk.Kind)
	case TokenKind_ListOpen:
		return fmt.Sprintf("<%c:%d>", tk.Kind, tk.Length)
	case TokenKind_ListClose:
		return fmt.Sprintf("<%c>", tk.Kind)
	case TokenKind_Null:
		return fmt.Sprintf("<%c>", tk.Kind)
	case TokenKind_Bool:
		return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Bool)
	case TokenKind_Int:
		return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Int)
	case TokenKind_Float:
		return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Float)
	case TokenKind_String:
		return fmt.Sprintf("<%c:%q>", tk.Kind, tk.Str)
	case TokenKind_Bytes:
		return fmt.Sprintf("<%c:%x>", tk.Kind, tk.Bytes)
	case TokenKind_Link:
		return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Link)
	default:
		return "<INVALID>"
	}
}

type TokenKind uint8

const (
	TokenKind_MapOpen   TokenKind = '{'
	TokenKind_MapClose  TokenKind = '}'
	TokenKind_ListOpen  TokenKind = '['
	TokenKind_ListClose TokenKind = ']'
	TokenKind_Null      TokenKind = '0'
	TokenKind_Bool      TokenKind = 'b'
	TokenKind_Int       TokenKind = 'i'
	TokenKind_Float     TokenKind = 'f'
	TokenKind_String    TokenKind = 's'
	TokenKind_Bytes     TokenKind = 'x'
	TokenKind_Link      TokenKind = '/'
)

type ErrMalformedTokenSequence struct {
	Detail string
}

func (e ErrMalformedTokenSequence) Error() string {
	return "malformed token sequence: " + e.Detail
}
@@ -0,0 +1,258 @@
package codectools

import (
	"fmt"
	"io"

	"github.com/ipld/go-ipld-prime"
	"github.com/ipld/go-ipld-prime/codec"
)

// TokenAssemble takes an ipld.NodeAssembler and a TokenReader,
// and repeatedly pumps the TokenReader for tokens and feeds their data into the ipld.NodeAssembler
// until it finishes a complete value.
//
// To compare and contrast to other token oriented tools:
// TokenAssemble does the same direction of information transfer as the TokenAssembler gadget does,
// but TokenAssemble moves completely through a value in one step,
// whereas the TokenAssembler accepts tokens pumped into it one step at a time.
//
// TokenAssemble does not enforce the "map keys must be strings" rule which is present in the Data Model;
// it will also happily do even recursive structures in map keys,
// meaning it can be used when handling schema values like maps with complex keys.
func TokenAssemble(na ipld.NodeAssembler, tr TokenReader, budget int) error {
	tk, err := tr(&budget)
	if err != nil {
		return err
	}
	return tokenAssemble(na, tk, tr, &budget)
}

func tokenAssemble(na ipld.NodeAssembler, tk *Token, tr TokenReader, budget *int) error {
	if *budget < 0 {
		return codec.ErrBudgetExhausted{}
	}
	switch tk.Kind {
	case TokenKind_MapOpen:
		if tk.Length > 0 && *budget < tk.Length*2 { // Pre-check budget: at least two decrements estimated for each entry.
			return codec.ErrBudgetExhausted{}
		}
		ma, err := na.BeginMap(tk.Length)
		if err != nil {
			return err
		}
		for {
			// Peek one token. We need to see if the map is about to end or not.
			tk, err = tr(budget)
			if err != nil {
				return err
			}
			// If the map has ended, invoke the finish operation and check for any errors.
			if tk.Kind == TokenKind_MapClose {
				return ma.Finish()
			}
			// Recurse to assemble the key.
			*budget-- // Decrement budget by at least one for each key. The key content may also cause further decrements.
			if err = tokenAssemble(ma.AssembleKey(), tk, tr, budget); err != nil {
				return err
			}
			// Recurse to assemble the value.
			// (We don't really care to peek this token, but do so anyway to keep the calling convention regular.)
			tk, err = tr(budget)
			if err != nil {
				return err
			}
			*budget-- // Decrement budget by at least one for each value. The value content may also cause further decrements.
			if err = tokenAssemble(ma.AssembleValue(), tk, tr, budget); err != nil {
				return err
			}
			// Continue around the loop, to encounter either the next entry or the end of the map.
		}
	case TokenKind_MapClose:
		return ErrMalformedTokenSequence{"map close token encountered while not in the middle of a map"}
	case TokenKind_ListOpen:
		if tk.Length > 0 && *budget < tk.Length { // Pre-check budget: at least one decrement estimated for each entry.
			return codec.ErrBudgetExhausted{}
		}
		la, err := na.BeginList(tk.Length)
		if err != nil {
			return err
		}
		for {
			// Peek one token. We need to see if the list is about to end or not.
			tk, err = tr(budget)
			if err != nil {
				return err
			}
			// If the list has ended, invoke the finish operation and check for any errors.
			if tk.Kind == TokenKind_ListClose {
				return la.Finish()
			}
			// Recurse to assemble the value.
			*budget-- // Decrement budget by at least one for each value. The value content may also cause further decrements.
			if err = tokenAssemble(la.AssembleValue(), tk, tr, budget); err != nil {
				return err
			}
			// Continue around the loop, to encounter either the next value or the end of the list.
		}
	case TokenKind_ListClose:
		return ErrMalformedTokenSequence{"list close token encountered while not in the middle of a list"}
	case TokenKind_Null:
		return na.AssignNull()
	case TokenKind_Bool:
		*budget--
		return na.AssignBool(tk.Bool)
	case TokenKind_Int:
		*budget--
		return na.AssignInt(int(tk.Int))
	case TokenKind_Float:
		*budget--
		return na.AssignFloat(tk.Float)
	case TokenKind_String:
		*budget -= len(tk.Str)
		return na.AssignString(tk.Str)
	case TokenKind_Bytes:
		*budget -= len(tk.Bytes)
		return na.AssignBytes(tk.Bytes)
	case TokenKind_Link:
		*budget--
		return na.AssignLink(tk.Link)
	default:
		panic(fmt.Errorf("unrecognized token kind (%q?)", tk.Kind))
	}
}

// --- the stepwise assembler system (more complicated; has a userland stack) is below -->

type TokenAssembler struct {
	// This structure is designed to be embeddable. Use Initialize when doing so.

	stk    assemblerStack // this is going to end up being a stack, you know
	budget int64
}

type assemblerStackRow struct {
	state uint8              // 0: assign this node; 1: continue list; 2: continue map with key; 3: continue map with value.
	na    ipld.NodeAssembler // Always present.
	la    ipld.ListAssembler // At most one of these is present.
	ma    ipld.MapAssembler  // At most one of these is present.
}
type assemblerStack []assemblerStackRow

func (stk assemblerStack) Tip() *assemblerStackRow {
	return &stk[len(stk)-1]
}
func (stk *assemblerStack) Push(na ipld.NodeAssembler) {
	*stk = append(*stk, assemblerStackRow{na: na})
}
func (stk *assemblerStack) Pop() {
	if len(*stk) == 0 {
		return
	}
	*stk = (*stk)[0 : len(*stk)-1]
}

func (ta *TokenAssembler) Initialize(na ipld.NodeAssembler, budget int64) {
	if ta.stk == nil {
		ta.stk = make(assemblerStack, 0, 10)
	} else {
		ta.stk = ta.stk[0:0]
	}
	ta.stk.Push(na)
	ta.budget = budget
}

// Process takes a Token pointer as an argument.
// (Notice how this function happens to match the definition of the visitFn that's usable as an argument to TokenWalk.)
// The token argument can be understood to be "borrowed" for the duration of the Process call, but will not be mutated.
// The use of a pointer here is so that a single Token can be reused by multiple calls, avoiding unnecessary allocations.
//
// Note that Process does very little sanity checking of token sequences itself,
// mostly handing information to the NodeAssemblers directly,
// which presumably will reject the data if it is out of line.
// The NodeAssembler this TokenAssembler is wrapping should already be enforcing the relevant logical rules,
// so it is not useful for TokenAssembler.Process to attempt to duplicate those checks;
// TokenAssembler.Process will also return any errors from the NodeAssembler without attempting to enforce a pattern on those errors.
// In particular, TokenAssembler.Process does not check if every MapOpen is paired with a MapClose;
// it does not check if every ListOpen is paired with a ListClose;
// and it does not check if the token stream is continuing after all open recursives have been closed.
// TODO: review this documentation; more of these checks turned out to be necessary than originally expected.
func (ta *TokenAssembler) Process(tk *Token) (err error) {
	if len(ta.stk) == 0 {
		return io.EOF
	}
	tip := ta.stk.Tip()
	switch tip.state {
	case 0:
		switch tk.Kind {
		case TokenKind_MapOpen:
			tip.ma, err = tip.na.BeginMap(tk.Length)
			tip.state = 2
			return err
		case TokenKind_MapClose:
			// Mostly we try to just forward things, but can't not check this one: tip.ma would be nil; there's no reasonable target for forwarding.
			return ErrMalformedTokenSequence{"map close token encountered while not in the middle of a map"}
		case TokenKind_ListOpen:
			tip.la, err = tip.na.BeginList(tk.Length)
			tip.state = 1
			return err
		case TokenKind_ListClose:
			// Mostly we try to just forward things, but can't not check this one: tip.la would be nil; there's no reasonable target for forwarding.
			return ErrMalformedTokenSequence{"list close token encountered while not in the middle of a list"}
		case TokenKind_Null:
			err = tip.na.AssignNull()
			ta.stk.Pop()
			return err
		case TokenKind_Bool:
			err = tip.na.AssignBool(tk.Bool)
			ta.stk.Pop()
			return err
		case TokenKind_Int:
			err = tip.na.AssignInt(int(tk.Int)) // TODO: upgrade all of ipld to use high precision int consistently
			ta.stk.Pop()
			return err
		case TokenKind_Float:
			err = tip.na.AssignFloat(tk.Float)
			ta.stk.Pop()
			return err
		case TokenKind_String:
			err = tip.na.AssignString(tk.Str)
			ta.stk.Pop()
			return err
		case TokenKind_Bytes:
			err = tip.na.AssignBytes(tk.Bytes)
			ta.stk.Pop()
			return err
		case TokenKind_Link:
			err = tip.na.AssignLink(tk.Link)
			ta.stk.Pop()
			return err
		default:
			panic(fmt.Errorf("unrecognized token kind (%q?)", tk.Kind))
		}
		return nil
	case 1:
		if tk.Kind == TokenKind_ListClose {
			err = tip.la.Finish()
			ta.stk.Pop()
			return err
		}
		ta.stk.Push(tip.la.AssembleValue())
		return ta.Process(tk)
	case 2:
		if tk.Kind == TokenKind_MapClose {
			err = tip.ma.Finish()
			ta.stk.Pop()
			return err
		}
		tip.state = 3
		ta.stk.Push(tip.ma.AssembleKey())
		return ta.Process(tk)
	case 3:
		tip.state = 2
		ta.stk.Push(tip.ma.AssembleValue())
		return ta.Process(tk)
	default:
		panic("unreachable")
	}
}
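Below is a rough sketch of how the two consumers above are meant to be driven, contrasting the one-shot TokenAssemble with the stepwise TokenAssembler. This is illustrative only and not part of this commit: TokenReader's definition does not appear in this excerpt, so the signature of the replayed token source is inferred from how tr is invoked above, and the helper names and the codectools import path are likewise assumptions.

package example

import (
	"io"

	"github.com/ipld/go-ipld-prime"
	"github.com/ipld/go-ipld-prime/codec/codectools" // import path assumed
)

// replay returns a token source that yields a fixed sequence, one token per call,
// and io.EOF afterwards. (The signature mirrors how tr is called in tokenAssemble.)
func replay(toks []codectools.Token) func(budget *int) (*codectools.Token, error) {
	i := 0
	return func(budget *int) (*codectools.Token, error) {
		if i >= len(toks) {
			return nil, io.EOF
		}
		tk := &toks[i]
		i++
		return tk, nil
	}
}

// assembleDemo feeds the token sequence for the map {"a": 1} into two assemblers:
// once via the one-shot TokenAssemble, and once by pushing tokens into a TokenAssembler.
func assembleDemo(na1, na2 ipld.NodeAssembler) error {
	toks := []codectools.Token{
		{Kind: codectools.TokenKind_MapOpen, Length: 1},
		{Kind: codectools.TokenKind_String, Str: "a"},
		{Kind: codectools.TokenKind_Int, Int: 1},
		{Kind: codectools.TokenKind_MapClose},
	}

	// One-shot: TokenAssemble pulls tokens until a complete value is assembled.
	if err := codectools.TokenAssemble(na1, replay(toks), 1<<20); err != nil {
		return err
	}

	// Stepwise: the caller pushes tokens into the TokenAssembler one at a time.
	var ta codectools.TokenAssembler
	ta.Initialize(na2, 1<<20)
	for i := range toks {
		if err := ta.Process(&toks[i]); err != nil {
			return err
		}
	}
	return nil
}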