diff --git a/codec/api.go b/codec/api.go
new file mode 100644
index 00000000..511b08de
--- /dev/null
+++ b/codec/api.go
@@ -0,0 +1,50 @@
+package codec
+
+import (
+	"io"
+
+	"github.com/ipld/go-ipld-prime"
+)
+
+// Encoder is the essential definition of a function that takes IPLD Data Model data in memory and serializes it.
+// IPLD Codecs are written by implementing this function interface (as well as (typically) a matched Decoder).
+//
+// Encoder functions can be composed into an ipld.LinkSystem to provide
+// a "one stop shop" API for handling content addressable storage.
+// Encoder functions can also be used directly if you want to handle serial data streams.
+//
+// Most codec packages will have a ReusableEncoder type
+// (which contains any working memory needed by the encoder implementation,
+// as well as any configuration options),
+// and that type will have an Encode function matching this interface.
+//
+// By convention, codec packages that have a multicodec contract will also have
+// a package-scope exported function called Encode which also matches this interface,
+// and is the equivalent of creating a zero-value ReusableEncoder (aka, default config)
+// and using its Encode method.
+// This package-scope function will typically also internally use a sync.Pool
+// to keep some ReusableEncoder values on hand to avoid unnecessary allocations.
+//
+// Note that a ReusableEncoder type that supports configuration options
+// does not functionally expose those options when invoked by the multicodec system --
+// multicodec indicators do not provide room for extended configuration info.
+// Codecs that expose configuration options are doing so for library users to enjoy;
+// it does not mean those non-default configurations will necessarily be available
+// in all scenarios that use codecs indirectly.
+// There is also no standard interface for such configurations: by nature,
+// if they exist at all, they vary per codec.
+type Encoder func(data ipld.Node, output io.Writer) error

+// Decoder is the essential definition of a function that consumes serial data and unfurls it into IPLD Data Model-compatible in-memory representations.
+// IPLD Codecs are written by implementing this function interface (as well as (typically) a matched Encoder).
+//
+// Decoder is the dual of Encoder.
+// Most of the documentation for the Encoder function interface
+// also applies wholesale to the Decoder interface.
+type Decoder func(into ipld.NodeAssembler, input io.Reader) error
+
+type ErrBudgetExhausted struct{}
+
+func (e ErrBudgetExhausted) Error() string {
+	return "decoder resource budget exhausted (message too long or too complex)"
+}
diff --git a/codec/codectools/token.go b/codec/codectools/token.go
new file mode 100644
index 00000000..9b771c86
--- /dev/null
+++ b/codec/codectools/token.go
@@ -0,0 +1,84 @@
+package codectools
+
+import (
+	"fmt"
+
+	"github.com/ipld/go-ipld-prime"
+)
+
+type Token struct {
+	Kind TokenKind
+
+	Length int     // Present for MapOpen or ListOpen. May be -1 for "unknown" (e.g. a json tokenizer will yield this).
+	Bool   bool    // Value. Union: only has meaning if Kind is TokenKind_Bool.
+	Int    int64   // Value. Union: only has meaning if Kind is TokenKind_Int.
+	Float  float64 // Value. Union: only has meaning if Kind is TokenKind_Float.
+	Str    string  // Value. Union: only has meaning if Kind is TokenKind_String. ('Str' rather than 'String' to avoid collision with method.)
+	Bytes  []byte  // Value. Union: only has meaning if Kind is TokenKind_Bytes.
+	Link ipld.Link // Value. Union: only has meaning if Kind is TokenKind_Link.
+
+	Node ipld.Node // Direct pointer to the original data, if this token is used to communicate data during a walk of existing in-memory data. Absent when token is being used during deserialization.
+
+	// The following fields all track position and progress:
+	// (These may be useful to copy into any error messages if errors arise.)
+	// (Implementations may assume token reuse and treat these as state keeping;
+	//  you may experience position accounting accuracy problems if *not* reusing tokens or if zeroing these fields.)
+
+	pth          []ipld.PathSegment // Set by token producers (whether marshallers or deserializers) to track logical position.
+	offset       int64              // Set by deserializers (for both textual or binary formats alike) to track progress.
+	lineOffset   int64              // Set by deserializers that work with textual data. May be ignored by binary deserializers.
+	columnOffset int64              // Set by deserializers that work with textual data. May be ignored by binary deserializers.
+}
+
+func (tk Token) String() string {
+	switch tk.Kind {
+	case TokenKind_MapOpen:
+		return fmt.Sprintf("<%c:%d>", tk.Kind, tk.Length)
+	case TokenKind_MapClose:
+		return fmt.Sprintf("<%c>", tk.Kind)
+	case TokenKind_ListOpen:
+		return fmt.Sprintf("<%c:%d>", tk.Kind, tk.Length)
+	case TokenKind_ListClose:
+		return fmt.Sprintf("<%c>", tk.Kind)
+	case TokenKind_Null:
+		return fmt.Sprintf("<%c>", tk.Kind)
+	case TokenKind_Bool:
+		return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Bool)
+	case TokenKind_Int:
+		return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Int)
+	case TokenKind_Float:
+		return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Float)
+	case TokenKind_String:
+		return fmt.Sprintf("<%c:%q>", tk.Kind, tk.Str)
+	case TokenKind_Bytes:
+		return fmt.Sprintf("<%c:%x>", tk.Kind, tk.Bytes)
+	case TokenKind_Link:
+		return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Link)
+	default:
+		return "<INVALID>"
+	}
+}
+
+type TokenKind uint8
+
+const (
+	TokenKind_MapOpen   TokenKind = '{'
+	TokenKind_MapClose  TokenKind = '}'
+	TokenKind_ListOpen  TokenKind = '['
+	TokenKind_ListClose TokenKind = ']'
+	TokenKind_Null      TokenKind = '0'
+	TokenKind_Bool      TokenKind = 'b'
+	TokenKind_Int       TokenKind = 'i'
+	TokenKind_Float     TokenKind = 'f'
+	TokenKind_String    TokenKind = 's'
+	TokenKind_Bytes     TokenKind = 'x'
+	TokenKind_Link      TokenKind = '/'
+)
+
+type ErrMalformedTokenSequence struct {
+	Detail string
+}
+
+func (e ErrMalformedTokenSequence) Error() string {
+	return "malformed token sequence: " + e.Detail
+}
diff --git a/codec/codectools/token_consumers.go b/codec/codectools/token_consumers.go
new file mode 100644
index 00000000..3ad2734d
--- /dev/null
+++ b/codec/codectools/token_consumers.go
@@ -0,0 +1,258 @@
+package codectools
+
+import (
+	"fmt"
+	"io"
+
+	"github.com/ipld/go-ipld-prime"
+	"github.com/ipld/go-ipld-prime/codec"
+)
+
+// TokenAssemble takes an ipld.NodeAssembler and a TokenReader,
+// and repeatedly pumps the TokenReader for tokens and feeds their data into the ipld.NodeAssembler
+// until it finishes a complete value.
+//
+// To compare and contrast to other token oriented tools:
+// TokenAssemble does the same direction of information transfer as the TokenAssembler gadget does,
+// but TokenAssemble moves completely through a value in one step,
+// whereas the TokenAssembler accepts tokens pumped into it one step at a time.
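+//
+// A minimal usage sketch (assumed names: 'tr' stands in for any real
+// TokenReader, and basicnode is used purely for illustration):
+//
+//	nb := basicnode.Prototype.Any.NewBuilder()
+//	if err := TokenAssemble(nb, tr, 1<<20); err != nil {
+//		// handle the error (it may be a codec.ErrBudgetExhausted).
+//	}
+//	n := nb.Build()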
+//
+// TokenAssemble does not enforce the "map keys must be strings" rule which is present in the Data Model;
+// it will also happily handle even recursive structures in map keys,
+// meaning it can be used when handling schema values like maps with complex keys.
+func TokenAssemble(na ipld.NodeAssembler, tr TokenReader, budget int) error {
+	tk, err := tr(&budget)
+	if err != nil {
+		return err
+	}
+	return tokenAssemble(na, tk, tr, &budget)
+}
+
+func tokenAssemble(na ipld.NodeAssembler, tk *Token, tr TokenReader, budget *int) error {
+	if *budget < 0 {
+		return codec.ErrBudgetExhausted{}
+	}
+	switch tk.Kind {
+	case TokenKind_MapOpen:
+		if tk.Length > 0 && *budget < tk.Length*2 { // Pre-check budget: at least two decrements estimated for each entry.
+			return codec.ErrBudgetExhausted{}
+		}
+		ma, err := na.BeginMap(tk.Length)
+		if err != nil {
+			return err
+		}
+		for {
+			// Peek one token. We need to see if the map is about to end or not.
+			tk, err = tr(budget)
+			if err != nil {
+				return err
+			}
+			// If the map has ended, invoke the finish operation and check for any errors.
+			if tk.Kind == TokenKind_MapClose {
+				return ma.Finish()
+			}
+			// Recurse to assemble the key.
+			*budget-- // Decrement budget by at least one for each key. The key content may also cause further decrements.
+			if err = tokenAssemble(ma.AssembleKey(), tk, tr, budget); err != nil {
+				return err
+			}
+			// Recurse to assemble the value.
+			// (We don't really care to peek this token, but do so anyway to keep the calling convention regular.)
+			tk, err = tr(budget)
+			if err != nil {
+				return err
+			}
+			*budget-- // Decrement budget by at least one for each value. The value content may also cause further decrements.
+			if err = tokenAssemble(ma.AssembleValue(), tk, tr, budget); err != nil {
+				return err
+			}
+			// Continue around the loop, to encounter either the next entry or the end of the map.
+		}
+	case TokenKind_MapClose:
+		return ErrMalformedTokenSequence{"map close token encountered while not in the middle of a map"}
+	case TokenKind_ListOpen:
+		if tk.Length > 0 && *budget < tk.Length { // Pre-check budget: at least one decrement estimated for each entry.
+			return codec.ErrBudgetExhausted{}
+		}
+		la, err := na.BeginList(tk.Length)
+		if err != nil {
+			return err
+		}
+		for {
+			// Peek one token. We need to see if the list is about to end or not.
+			tk, err = tr(budget)
+			if err != nil {
+				return err
+			}
+			// If the list has ended, invoke the finish operation and check for any errors.
+			if tk.Kind == TokenKind_ListClose {
+				return la.Finish()
+			}
+			// Recurse to assemble the value.
+			*budget-- // Decrement budget by at least one for each value. The value content may also cause further decrements.
+			if err = tokenAssemble(la.AssembleValue(), tk, tr, budget); err != nil {
+				return err
+			}
+			// Continue around the loop, to encounter either the next value or the end of the list.
+ } + case TokenKind_ListClose: + return ErrMalformedTokenSequence{"list close token encountered while not in the middle of a list"} + case TokenKind_Null: + return na.AssignNull() + case TokenKind_Bool: + *budget-- + return na.AssignBool(tk.Bool) + case TokenKind_Int: + *budget-- + return na.AssignInt(int(tk.Int)) + case TokenKind_Float: + *budget-- + return na.AssignFloat(tk.Float) + case TokenKind_String: + *budget -= len(tk.Str) + return na.AssignString(tk.Str) + case TokenKind_Bytes: + *budget -= len(tk.Bytes) + return na.AssignBytes(tk.Bytes) + case TokenKind_Link: + *budget-- + return na.AssignLink(tk.Link) + default: + panic(fmt.Errorf("unrecognized token kind (%q?)", tk.Kind)) + } +} + +// --- the stepwise assembler system (more complicated; has a userland stack) is below --> + +type TokenAssembler struct { + // This structure is designed to be embeddable. Use Initialize when doing so. + + stk assemblerStack // this is going to end up being a stack you know + budget int64 +} + +type assemblerStackRow struct { + state uint8 // 0: assign this node; 1: continue list; 2: continue map with key; 3: continue map with value. + na ipld.NodeAssembler // Always present. + la ipld.ListAssembler // At most one of these is present. + ma ipld.MapAssembler // At most one of these is present. +} +type assemblerStack []assemblerStackRow + +func (stk assemblerStack) Tip() *assemblerStackRow { + return &stk[len(stk)-1] +} +func (stk *assemblerStack) Push(na ipld.NodeAssembler) { + *stk = append(*stk, assemblerStackRow{na: na}) +} +func (stk *assemblerStack) Pop() { + if len(*stk) == 0 { + return + } + *stk = (*stk)[0 : len(*stk)-1] +} + +func (ta *TokenAssembler) Initialize(na ipld.NodeAssembler, budget int64) { + if ta.stk == nil { + ta.stk = make(assemblerStack, 0, 10) + } else { + ta.stk = ta.stk[0:0] + } + ta.stk.Push(na) + ta.budget = budget +} + +// Process takes a Token pointer as an argument. +// (Notice how this function happens to match the definition of the visitFn that's usable as an argument to TokenWalk.) +// The token argument can be understood to be "borrowed" for the duration of the Process call, but will not be mutated. +// The use of a pointer here is so that a single Token can be reused by multiple calls, avoiding unnecessary allocations. +// +// Note that Process does very little sanity checking of token sequences itself, +// mostly handing information to the NodeAssemblers directly, +// which presumably will reject the data if it is out of line. +// The NodeAssembler this TokenAssembler is wrapping should already be enforcing the relevant logical rules, +// so it is not useful for TokenAssembler.Process to attempt to duplicate those checks; +// TokenAssembler.Process will also return any errors from the NodeAssembler without attempting to enforce a pattern on those errors. +// In particular, TokenAssembler.Process does not check if every MapOpen is paired with a MapClose; +// it does not check if every ListOpen is paired with a ListClose; +// and it does not check if the token stream is continuing after all open recursives have been closed. +// TODO: review this documentation; more of these checks turn out necessary anyway than originally expected. 
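+//
+// A minimal pump sketch, mirroring the tests in this package
+// (the 'tokens' slice and the NodeBuilder 'nb' are illustrative):
+//
+//	var ta TokenAssembler
+//	ta.Initialize(nb, 1<<20)
+//	for _, tk := range tokens {
+//		if err := ta.Process(&tk); err != nil {
+//			// handle error
+//		}
+//	}
+//	n := nb.Build()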
+func (ta *TokenAssembler) Process(tk *Token) (err error) {
+	if len(ta.stk) == 0 {
+		return io.EOF
+	}
+	tip := ta.stk.Tip()
+	switch tip.state {
+	case 0:
+		switch tk.Kind {
+		case TokenKind_MapOpen:
+			tip.ma, err = tip.na.BeginMap(tk.Length)
+			tip.state = 2
+			return err
+		case TokenKind_MapClose:
+			// Mostly we try to just forward things, but can't not check this one: tip.ma would be nil; there's no reasonable target for forwarding.
+			return ErrMalformedTokenSequence{"map close token encountered while not in the middle of a map"}
+		case TokenKind_ListOpen:
+			tip.la, err = tip.na.BeginList(tk.Length)
+			tip.state = 1
+			return err
+		case TokenKind_ListClose:
+			// Mostly we try to just forward things, but can't not check this one: tip.la would be nil; there's no reasonable target for forwarding.
+			return ErrMalformedTokenSequence{"list close token encountered while not in the middle of a list"}
+		case TokenKind_Null:
+			err = tip.na.AssignNull()
+			ta.stk.Pop()
+			return err
+		case TokenKind_Bool:
+			err = tip.na.AssignBool(tk.Bool)
+			ta.stk.Pop()
+			return err
+		case TokenKind_Int:
+			err = tip.na.AssignInt(int(tk.Int)) // TODO: upgrade all of ipld to use high precision int consistently
+			ta.stk.Pop()
+			return err
+		case TokenKind_Float:
+			err = tip.na.AssignFloat(tk.Float)
+			ta.stk.Pop()
+			return err
+		case TokenKind_String:
+			err = tip.na.AssignString(tk.Str)
+			ta.stk.Pop()
+			return err
+		case TokenKind_Bytes:
+			err = tip.na.AssignBytes(tk.Bytes)
+			ta.stk.Pop()
+			return err
+		case TokenKind_Link:
+			err = tip.na.AssignLink(tk.Link)
+			ta.stk.Pop()
+			return err
+		default:
+			panic(fmt.Errorf("unrecognized token kind (%q?)", tk.Kind))
+		}
+		return nil
+	case 1:
+		if tk.Kind == TokenKind_ListClose {
+			err = tip.la.Finish()
+			ta.stk.Pop()
+			return err
+		}
+		ta.stk.Push(tip.la.AssembleValue())
+		return ta.Process(tk)
+	case 2:
+		if tk.Kind == TokenKind_MapClose {
+			err = tip.ma.Finish()
+			ta.stk.Pop()
+			return err
+		}
+		tip.state = 3
+		ta.stk.Push(tip.ma.AssembleKey())
+		return ta.Process(tk)
+	case 3:
+		tip.state = 2
+		ta.stk.Push(tip.ma.AssembleValue())
+		return ta.Process(tk)
+	default:
+		panic("unreachable")
+	}
+}
diff --git a/codec/codectools/token_consumers_test.go b/codec/codectools/token_consumers_test.go
new file mode 100644
index 00000000..ddeb34fe
--- /dev/null
+++ b/codec/codectools/token_consumers_test.go
@@ -0,0 +1,39 @@
+package codectools
+
+import (
+	"io"
+	"testing"
+
+	. "github.com/warpfork/go-wish"
+)
+
+func TestTokenAssemble(t *testing.T) {
+	for _, tcase := range tokenFixtures {
+		nb := tcase.value.Prototype().NewBuilder()
+		var readerOffset int
+		err := TokenAssemble(nb, func(budget *int) (*Token, error) {
+			if readerOffset >= len(tcase.sequence) {
+				return nil, io.EOF
+			}
+			readerOffset++
+			return &tcase.sequence[readerOffset-1], nil
+		}, 1<<10)
+		if err != nil {
+			t.Error(err)
+		}
+		Wish(t, nb.Build(), ShouldEqual, tcase.value)
+	}
+}
+
+func TestTokenAssembler(t *testing.T) {
+	for _, tcase := range tokenFixtures {
+		nb := tcase.value.Prototype().NewBuilder()
+		var ta TokenAssembler
+		ta.Initialize(nb, 1<<10)
+		for _, tk := range tcase.sequence {
+			err := ta.Process(&tk)
+			Wish(t, err, ShouldEqual, nil)
+		}
+		Wish(t, nb.Build(), ShouldEqual, tcase.value)
+	}
+}
diff --git a/codec/codectools/token_producers.go b/codec/codectools/token_producers.go
new file mode 100644
index 00000000..4f8ce2db
--- /dev/null
+++ b/codec/codectools/token_producers.go
@@ -0,0 +1,286 @@
+package codectools
+
+import (
+	"errors"
+	"fmt"
+	"io"
+
+	"github.com/ipld/go-ipld-prime"
+)
+
+// TokenWalk walks an ipld Node and repeatedly calls the visitFn,
+// calling it once for every "token" yielded by the walk.
+// Every map and list is yielded as a token at its beginning,
+// and another token when it's finished;
+// every scalar value (strings, bools, bytes, ints, etc) is yielded as a single token.
+//
+// The token pointer given to the visitFn will be identical on every call,
+// but the data it contains will vary.
+// The token may contain invalid data that is leftover from previous calls
+// in some of its union fields; correct behavior requires looking at the
+// token's Kind field before handling any of its other fields.
+//
+// If any error is returned by the visitFn, it will cause the walk to halt,
+// and TokenWalk will return that error.
+// However, if the error is the value TokenWalkSkip, and it's been returned
+// when visitFn was called with a MapOpen or ListOpen token, the walk will
+// skip forward over that entire map or list, and continue (with the
+// next token being the close token that complements the open token).
+// Returning a TokenWalkSkip when the token was any of the scalar kinds
+// (e.g. anything other than a MapOpen or a ListOpen) has no effect.
+//
+// TokenAssembler is the rough dual of TokenWalk.
+func TokenWalk(n ipld.Node, visitFn func(tk *Token) error) error {
+	// TokenWalk would be trivial to implement over NodeTokenizer,
+	// but we do a distinct implementation here because NodeTokenizer's resumable implementation means it needs a user-space stack,
+	// and to reuse that would require allocations which this method (since it's not resumable in the same way) can easily avoid (or at least, keep on the stack).
+
+	var tk Token // For capture, once.
+	return tokenWalk(&tk, n, visitFn)
+}
+
+func tokenWalk(tk *Token, n ipld.Node, visitFn func(*Token) error) error {
+	switch n.ReprKind() {
+	case ipld.ReprKind_Map:
+		tk.Kind = TokenKind_MapOpen
+		tk.Length = n.Length()
+		tk.Node = n
+		if err := visitFn(tk); err != nil {
+			return err
+		}
+		mitr := n.MapIterator()
+		for !mitr.Done() {
+			k, v, err := mitr.Next()
+			if err != nil {
+				return err
+			}
+			if err := tokenWalk(tk, k, visitFn); err != nil {
+				return err
+			}
+			if err := tokenWalk(tk, v, visitFn); err != nil {
+				return err
+			}
+		}
+		tk.Kind = TokenKind_MapClose
+		tk.Node = n
+		return visitFn(tk)
+	case ipld.ReprKind_List:
+		tk.Kind = TokenKind_ListOpen
+		tk.Length = n.Length()
+		tk.Node = n
+		if err := visitFn(tk); err != nil {
+			return err
+		}
+		litr := n.ListIterator()
+		for !litr.Done() {
+			_, v, err := litr.Next()
+			if err != nil {
+				return err
+			}
+			if err := tokenWalk(tk, v, visitFn); err != nil {
+				return err
+			}
+		}
+		tk.Kind = TokenKind_ListClose
+		tk.Node = n
+		return visitFn(tk)
+	case ipld.ReprKind_Null:
+		tk.Kind = TokenKind_Null
+		return visitFn(tk)
+	case ipld.ReprKind_Bool:
+		tk.Kind = TokenKind_Bool
+		tk.Bool, _ = n.AsBool()
+		return visitFn(tk)
+	case ipld.ReprKind_Int:
+		tk.Kind = TokenKind_Int
+		i, _ := n.AsInt()
+		tk.Int = int64(i) // TODO: upgrade all of ipld to use high precision int consistently
+		return visitFn(tk)
+	case ipld.ReprKind_Float:
+		tk.Kind = TokenKind_Float
+		tk.Float, _ = n.AsFloat()
+		return visitFn(tk)
+	case ipld.ReprKind_String:
+		tk.Kind = TokenKind_String
+		tk.Str, _ = n.AsString()
+		return visitFn(tk)
+	case ipld.ReprKind_Bytes:
+		tk.Kind = TokenKind_Bytes
+		tk.Bytes, _ = n.AsBytes()
+		return visitFn(tk)
+	case ipld.ReprKind_Link:
+		tk.Kind = TokenKind_Link
+		tk.Link, _ = n.AsLink()
+		return visitFn(tk)
+	default:
+		panic(fmt.Errorf("unrecognized node kind (%q?)", n.ReprKind()))
+	}
+	return nil
+}
+
+var TokenWalkSkip = errors.New("token walk: skip")
+
+// --- the stepwise token producer system (more complicated; has a userland stack) is below -->
+
+// A TokenReader can be produced from any ipld.Node using NodeTokenizer.
+// TokenReaders are also commonly implemented by codec packages,
+// wherein they're created over a serial data stream and tokenize that stream when pumped.
+//
+// TokenReader implementations are encouraged to yield the same token pointer repeatedly,
+// just varying the contents of the value, in order to avoid unnecessary allocations.
+//
+// A 'budget' parameter must be provided to a TokenReader as a pointer to an integer.
+// The TokenReader should limit how much memory it uses according to the budget remaining.
+// (The budget is considered to be roughly in units of bytes, but can be treated as an approximation.)
+// The budget should primarily be managed by the caller of the TokenReader
+// (e.g., after the TokenReader returns a 20-byte string, the caller should decrement the budget by 20),
+// but a TokenReader may also do its own decrements to the budget if some operations are particularly costly and the TokenReader wants this to be accounted for.
+// The budget may be ignored if the TokenReader is just yielding access to already in-memory information;
+// the main intent of the budget is to avoid resource exhaustion when bringing new data into program memory.
+//
+type TokenReader func(budget *int) (next *Token, err error)
+
+type NodeTokenizer struct {
+	// This structure is designed to be embeddable. Use Initialize when doing so.
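+	//
+	// A minimal pump sketch, mirroring the tests in this package
+	// (the node 'n' is illustrative):
+	//
+	//	var nt NodeTokenizer
+	//	nt.Initialize(n)
+	//	for {
+	//		tk, err := nt.ReadToken()
+	//		if err == io.EOF {
+	//			break
+	//		} else if err != nil {
+	//			// handle error
+	//		}
+	//		// ... consume tk before the next ReadToken call ...
+	//	}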
+
+	tk  Token // We embed this to avoid allocations; we'll be repeatedly yielding a pointer to this piece of memory.
+	stk nodeTokenizerStack
+}
+
+func (nt *NodeTokenizer) Initialize(n ipld.Node) {
+	if nt.stk == nil {
+		nt.stk = make(nodeTokenizerStack, 0, 10)
+	} else {
+		nt.stk = nt.stk[0:0]
+	}
+	nt.stk.Push(n)
+}
+
+type nodeTokenizerStackRow struct {
+	state uint8             // 0: start this node; 1: continue list; 2: continue map with key; 3: continue map with value.
+	n     ipld.Node         // Always present.
+	litr  ipld.ListIterator // At most one of these is present.
+	mitr  ipld.MapIterator  // At most one of these is present.
+	mval  ipld.Node         // The value to resume at when in state 3.
+}
+type nodeTokenizerStack []nodeTokenizerStackRow
+
+func (stk nodeTokenizerStack) Tip() *nodeTokenizerStackRow {
+	return &stk[len(stk)-1]
+}
+func (stk *nodeTokenizerStack) Push(n ipld.Node) {
+	*stk = append(*stk, nodeTokenizerStackRow{n: n})
+}
+func (stk *nodeTokenizerStack) Pop() {
+	if len(*stk) == 0 {
+		return
+	}
+	*stk = (*stk)[0 : len(*stk)-1]
+}
+
+// ReadToken fits the TokenReader functional interface, and so may be used anywhere a TokenReader is required.
+func (nt *NodeTokenizer) ReadToken() (next *Token, err error) {
+	// How stack depth works:
+	// - finding that you're starting to handle a map or list leaves it the same;
+	// - before recursing to handle a child key or value, push stack;
+	// - any time you finish something, whether scalar or recursive, pop stack.
+	// This could be written differently: in particular,
+	// scalar leaves could be handled without increasing stack depth by that last increment.
+	// However, doing so would make for more complicated code.
+	// Maybe worth it; PRs welcome; benchmarks first.
+	if len(nt.stk) == 0 {
+		return nil, io.EOF
+	}
+	tip := nt.stk.Tip()
+	switch tip.state {
+	case 0:
+		switch tip.n.ReprKind() {
+		case ipld.ReprKind_Map:
+			nt.tk.Kind = TokenKind_MapOpen
+			nt.tk.Length = tip.n.Length()
+			nt.tk.Node = tip.n
+			tip.state = 2
+			tip.mitr = tip.n.MapIterator()
+			return &nt.tk, nil
+		case ipld.ReprKind_List:
+			nt.tk.Kind = TokenKind_ListOpen
+			nt.tk.Length = tip.n.Length()
+			nt.tk.Node = tip.n
+			tip.state = 1
+			tip.litr = tip.n.ListIterator()
+			return &nt.tk, nil
+		case ipld.ReprKind_Null:
+			nt.tk.Kind = TokenKind_Null
+			nt.stk.Pop()
+			return &nt.tk, nil
+		case ipld.ReprKind_Bool:
+			nt.tk.Kind = TokenKind_Bool
+			nt.tk.Bool, _ = tip.n.AsBool()
+			nt.stk.Pop()
+			return &nt.tk, nil
+		case ipld.ReprKind_Int:
+			nt.tk.Kind = TokenKind_Int
+			i, _ := tip.n.AsInt()
+			nt.tk.Int = int64(i) // TODO: upgrade all of ipld to use high precision int consistently
+			nt.stk.Pop()
+			return &nt.tk, nil
+		case ipld.ReprKind_Float:
+			nt.tk.Kind = TokenKind_Float
+			nt.tk.Float, _ = tip.n.AsFloat()
+			nt.stk.Pop()
+			return &nt.tk, nil
+		case ipld.ReprKind_String:
+			nt.tk.Kind = TokenKind_String
+			nt.tk.Str, _ = tip.n.AsString()
+			nt.stk.Pop()
+			return &nt.tk, nil
+		case ipld.ReprKind_Bytes:
+			nt.tk.Kind = TokenKind_Bytes
+			nt.tk.Bytes, _ = tip.n.AsBytes()
+			nt.stk.Pop()
+			return &nt.tk, nil
+		case ipld.ReprKind_Link:
+			nt.tk.Kind = TokenKind_Link
+			nt.tk.Link, _ = tip.n.AsLink()
+			nt.stk.Pop()
+			return &nt.tk, nil
+		default:
+			panic(fmt.Errorf("unrecognized node kind (%q?)", tip.n.ReprKind()))
+		}
+	case 1:
+		if tip.litr.Done() {
+			nt.tk.Kind = TokenKind_ListClose
+			nt.tk.Node = tip.n
+			nt.stk.Pop()
+			return &nt.tk, nil
+		}
+		_, v, err := tip.litr.Next()
+		if err != nil {
+			return nil, err
+		}
+		nt.stk.Push(v)
+		return nt.ReadToken()
+	case 2:
+		if tip.mitr.Done() {
+			nt.tk.Kind = 
TokenKind_MapClose + nt.tk.Node = tip.n + nt.stk.Pop() + return &nt.tk, nil + } + k, v, err := tip.mitr.Next() + if err != nil { + return nil, err + } + tip.mval = v + tip.state = 3 + nt.stk.Push(k) + return nt.ReadToken() + case 3: + tip.state = 2 + nt.stk.Push(tip.mval) + return nt.ReadToken() + default: + panic("unreachable") + } +} diff --git a/codec/codectools/token_producers_test.go b/codec/codectools/token_producers_test.go new file mode 100644 index 00000000..a86cffda --- /dev/null +++ b/codec/codectools/token_producers_test.go @@ -0,0 +1,42 @@ +package codectools + +import ( + "io" + "testing" + + . "github.com/warpfork/go-wish" +) + +func TestTokenWalk(t *testing.T) { + for _, tcase := range tokenFixtures { + var result []Token + err := TokenWalk(tcase.value, func(tk *Token) error { + result = append(result, *tk) + return nil + }) + if err != nil { + t.Error(err) + } + Wish(t, StringifyTokenSequence(result), ShouldEqual, StringifyTokenSequence(tcase.sequence)) + } +} + +func TestNodeTokenizer(t *testing.T) { + for _, tcase := range tokenFixtures { + var nt NodeTokenizer + var result []Token + nt.Initialize(tcase.value) + for { + tk, err := nt.ReadToken() + if err == nil { + result = append(result, *tk) + } else if err == io.EOF { + break + } else { + t.Error(err) + break + } + } + Wish(t, StringifyTokenSequence(result), ShouldEqual, StringifyTokenSequence(tcase.sequence)) + } +} diff --git a/codec/codectools/token_test.go b/codec/codectools/token_test.go new file mode 100644 index 00000000..fdac573b --- /dev/null +++ b/codec/codectools/token_test.go @@ -0,0 +1,70 @@ +package codectools + +import ( + "github.com/ipld/go-ipld-prime" + "github.com/ipld/go-ipld-prime/fluent" + "github.com/ipld/go-ipld-prime/must" + basicnode "github.com/ipld/go-ipld-prime/node/basic" +) + +var tokenFixtures = []struct { + value ipld.Node + sequence []Token +}{ + { + value: must.Node(fluent.Reflect(basicnode.Prototype.Any, + "a scalar", + )), + sequence: []Token{ + {Kind: TokenKind_String, Str: "a scalar"}, + }, + }, + { + value: must.Node(fluent.Reflect(basicnode.Prototype.Any, + map[string]interface{}{ + "a": "b", + "c": "d", + }, + )), + sequence: []Token{ + {Kind: TokenKind_MapOpen, Length: 2}, + /**/ {Kind: TokenKind_String, Str: "a"}, {Kind: TokenKind_String, Str: "b"}, + /**/ {Kind: TokenKind_String, Str: "c"}, {Kind: TokenKind_String, Str: "d"}, + {Kind: TokenKind_MapClose}, + }, + }, + { + value: must.Node(fluent.Reflect(basicnode.Prototype.Any, + map[string]interface{}{ + "a": 1, + "b": map[string]interface{}{ + "c": "d", + }, + }, + )), + sequence: []Token{ + {Kind: TokenKind_MapOpen, Length: 2}, + /**/ {Kind: TokenKind_String, Str: "a"}, {Kind: TokenKind_Int, Int: 1}, + /**/ {Kind: TokenKind_String, Str: "b"}, {Kind: TokenKind_MapOpen, Length: 1}, + /**/ /**/ {Kind: TokenKind_String, Str: "c"}, {Kind: TokenKind_String, Str: "d"}, + /**/ {Kind: TokenKind_MapClose}, + {Kind: TokenKind_MapClose}, + }, + }, + { + value: must.Node(fluent.Reflect(basicnode.Prototype.Any, + []interface{}{ + "a", + "b", + "c", + }, + )), + sequence: []Token{ + {Kind: TokenKind_ListOpen, Length: 3}, + /**/ {Kind: TokenKind_String, Str: "a"}, + /**/ {Kind: TokenKind_String, Str: "b"}, + /**/ {Kind: TokenKind_String, Str: "c"}, + {Kind: TokenKind_ListClose}, + }, + }, +} diff --git a/codec/codectools/token_util.go b/codec/codectools/token_util.go new file mode 100644 index 00000000..4dc2c54f --- /dev/null +++ b/codec/codectools/token_util.go @@ -0,0 +1,44 @@ +package codectools + +import ( + "strings" +) + +// 
Normalize sets any value in the token to its zero value if it's not applicable for the token's kind.
+// E.g., if the token kind is string, the float, bytes, and other value fields are all zeroed.
+// Path and offset progress information is left unmodified.
+// This is sometimes helpful in writing test fixtures and equality assertions.
+func (tk *Token) Normalize() {
+	if tk.Kind != TokenKind_MapOpen && tk.Kind != TokenKind_ListOpen {
+		tk.Length = 0
+	}
+	if tk.Kind != TokenKind_Bool {
+		tk.Bool = false
+	}
+	if tk.Kind != TokenKind_Int {
+		tk.Int = 0
+	}
+	if tk.Kind != TokenKind_Float {
+		tk.Float = 0
+	}
+	if tk.Kind != TokenKind_String {
+		tk.Str = ""
+	}
+	if tk.Kind != TokenKind_Bytes {
+		tk.Bytes = nil
+	}
+	if tk.Kind != TokenKind_Link {
+		tk.Link = nil
+	}
+}
+
+// StringifyTokenSequence is a utility function often handy for testing.
+// (Doing a diff on strings of tokens gives very good reports for minimal effort.)
+func StringifyTokenSequence(seq []Token) string {
+	var sb strings.Builder
+	for _, tk := range seq {
+		sb.WriteString(tk.String())
+		sb.WriteByte('\n')
+	}
+	return sb.String()
+}
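
A closing sketch, illustrative and not part of the diff above: the doc comment on TokenAssembler.Process notes that its signature matches the visitFn accepted by TokenWalk, so the two compose directly. Assuming an existing ipld.Node named n, and using basicnode for the builder, a token-stream deep copy could look like this:

	// Illustrative only: deep-copy a node by piping TokenWalk's tokens into a TokenAssembler.
	nb := basicnode.Prototype.Any.NewBuilder()
	var ta codectools.TokenAssembler
	ta.Initialize(nb, 1<<20) // budget, in rough units of bytes
	if err := codectools.TokenWalk(n, ta.Process); err != nil {
		// handle error
	}
	copied := nb.Build()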