
feat: fr32-sha2-256-trunc254-padded-binary-tree #19

Merged
merged 10 commits into from
Jul 19, 2023
trim duplicate logic
Gozala committed Jul 19, 2023
commit 14a27ff076e8c7a058118eecf728e1121285af8f
189 changes: 123 additions & 66 deletions src/multihash.js
@@ -37,6 +37,8 @@ export const MAX_HEIGHT = 255
export const MAX_PAYLOAD_SIZE = 2 ** 255 * FR_RATIO

/**
* Computes the digest of the given payload.
*
* @param {Uint8Array} payload
* @returns {StreamDigest}
*/
@@ -47,6 +49,9 @@ export const digest = (payload) => {
}

/**
* Creates a streaming hasher that can be used to consume larger streams
* of data than would be practical to load into memory all at once.
*
* @returns {API.StreamingHasher<typeof code, typeof size, StreamDigest>}
*/
export const create = () => new Hasher()
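
For context, a minimal usage sketch of the streaming API; the import specifier is a placeholder, while `create`, `write`, and `digest` come from the code in this diff:

```js
import { create } from './multihash.js' // hypothetical path

const hasher = create()
// Feed chunks of arbitrary size; the hasher buffers partial quads internally.
hasher.write(new Uint8Array(65_536).fill(1))
hasher.write(new Uint8Array(1_000).fill(2))

// Multihash digest of everything written so far.
const digest = hasher.digest()
```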
@@ -59,7 +64,7 @@ export const create = () => new Hasher()
class Hasher {
constructor() {
/**
* The number of bytes written into the hasher.
* The number of bytes consumed by the hasher.
*
* @private
*/
@@ -86,20 +91,33 @@ class Hasher {
this.offset = 0

/**
* The layers of the tree. Each layer will contain the
* The layers of the tree. Each layer will contain either zero or one node
* between writes. When we write into the hasher, once we have enough bytes,
* leaves are created and pushed into the `layers[0]` array, after which we
* flush, combining every two leaves into a node that is moved to the next
* layer. This process is repeated until we reach the top layer, leaving
* each layer either empty or with a single node.
*
* @type {Layers}
*/
this.layers = [[]]
}

/**
* Returns the total number of bytes written into the hasher. Calling
* {@link reset} will reset the hasher and set the count back to 0.
*
* @returns {bigint}
*/
count() {
return this.bytesWritten
}

/**
* Digest collapses the internal hash state and returns the resulting raw 32
* bytes of commP
* Computes the digest of all the data that has been written into this hasher.
* This method has no side effects, meaning that you can continue writing and
* call this method again to compute the digest of all the data written from
* the very beginning.
*/
digest() {
const buffer = new Uint8Array(MULTIHASH_SIZE)
@@ -108,26 +126,43 @@ class Hasher {
}
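
A short sketch of the side-effect-free behavior described above; the 127-byte chunk size assumes the usual fr32 quad constants and is illustrative only:

```js
const hasher = create()
hasher.write(new Uint8Array(127)) // assumed to be exactly one quad
const first = hasher.digest()     // digest of the first 127 bytes

// Computing a digest does not collapse state, so we can keep writing.
hasher.write(new Uint8Array(127))
const second = hasher.digest()    // digest of all 254 bytes from the start
```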

/**
* Computes the digest and writes it into the given buffer. You can provide
* an optional `byteOffset` to write the digest at that offset in the buffer.
* By default the multihash prefix is written into the buffer, but you can
* opt out by passing `false` as the `asMultihash` argument.
*
* @param {Uint8Array} output
* @param {number} [byteOffset]
* @param {boolean} [asMultihash]
*/
digestInto(output, byteOffset = 0, asMultihash = true) {
const { buffer, layers, offset } = this
// If we have remaining bytes in the buffer we pad with zeros and turn
// them into leaf nodes. Note that it is safe to mutate the buffer here
// as bytes past `offset` are considered dirty.
const nodes = offset > 0 ? split(pad(buffer.fill(0, offset))) : undefined
const { root, height } = computedRoot(layers, nodes)

// We do not want to mutate the layers, so we create a shallow copy of it
// which we will use to compute the root.
let [leaves, ...nodes] = layers

// If we have some bytes in the buffer we fill the rest with zeros and
// compute leaves from them. Note that it is safe to mutate the buffer here
// as bytes past `offset` are considered dirty and should not be read.
if (offset > 0) {
leaves = [...leaves, ...split(pad(buffer.fill(0, offset)))]
}

const tree = build([leaves, ...nodes])
const height = tree.length - 1
const [root] = tree[height]

// Write the multihash prefix if requested
if (asMultihash) {
output.set(PREFIX, byteOffset)
byteOffset += PREFIX.length
}

// Write the tree height as the first byte of the digest
output[byteOffset] = height
byteOffset += 1
// Write the root as the remaining 32 bytes of the digest
output.set(root, byteOffset)

return this
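
A hedged sketch of the output layout `digestInto` produces, based on the writes visible above (the `PREFIX` bytes, one height byte, a 32-byte root); the assumption that `MULTIHASH_SIZE` covers all three is not confirmed by this diff:

```js
const output = new Uint8Array(MULTIHASH_SIZE)
hasher.digestInto(output)
// output = [ ...PREFIX, height, ...root ] — prefix, 1 byte, 32 bytes

// Passing `false` skips the multihash prefix: 1 height byte + 32-byte root.
const raw = new Uint8Array(33)
hasher.digestInto(raw, 0, false)
```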
@@ -140,6 +175,7 @@ class Hasher {
const { buffer, offset, layers } = this
const leaves = layers[0]
const { length } = bytes
// If we got no bytes there is nothing to do here
if (length === 0) {
return this
/* c8 ignore next 5 */
@@ -148,66 +184,59 @@ class Hasher {
`Writing ${length} bytes exceeds max payload size of ${MAX_PAYLOAD_SIZE}`
)
}
// If we do not have enough bytes to fill a quad, just add them to the
// buffer
// If we do not have enough bytes to form a quad, just append the new bytes
// to the buffer and return.
else if (offset + length < buffer.length) {
buffer.set(bytes, offset)
this.offset += length
this.bytesWritten += BigInt(length)
return this
}
// If we are here we have more or equal number of bytes to fill the buffer
// in which case we fill it and process the rest.
// Otherwise we first fill the buffer to form a quad and create some leaves.
// Then we slice the remaining bytes into quad-sized chunks and create leaves
// from them. If some bytes are left over we copy them into the buffer and
// flush, combining node pairs and propagating them up the tree.
else {
// Number of bytes required to fill the buffer
// Number of bytes required to fill the quad buffer
const bytesRequired = buffer.length - offset
// Fill the remainder of the buffer from the given bytes and then
// create leaf from it
// copy the required bytes into the buffer and turn them into leaves
// which we push into the leaf layer.
buffer.set(bytes.subarray(0, bytesRequired), offset)
leaves.push(...split(pad(buffer)))

// Now we slice the remaining bytes into quads, create leaves from them
// and push them into the leaf layer.
let readOffset = bytesRequired
// Rest of the bytes are also sliced into quads and
while (readOffset + IN_BYTES_PER_QUAD < length) {
const quad = bytes.subarray(readOffset, readOffset + IN_BYTES_PER_QUAD)
leaves.push(...split(pad(quad)))
readOffset += IN_BYTES_PER_QUAD
}

// Remaining bytes are copied into the buffer
// Whatever bytes were left are copied into the buffer and we update
// the offset to reflect that.
this.buffer.set(bytes.subarray(readOffset), 0)
this.offset = length - readOffset

this.flush()

// We also update the total number of bytes written.
this.bytesWritten += BigInt(length)

// Now prune the layers to propagate all the new leaves up the tree.
prune(this.layers)

return this
}
}
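
To make the buffering rule above concrete, here is a sketch of the size arithmetic. The value of `IN_BYTES_PER_QUAD` (127 payload bytes, padding to 128, i.e. four 32-byte leaves) is an assumption based on typical fr32 constants, not confirmed by this diff:

```js
const IN_BYTES_PER_QUAD = 127 // assumed fr32 constant

// Given `buffered` bytes already in the quad buffer and `incoming` new bytes,
// compute how many full quads become leaves and how many bytes stay buffered.
const plan = (buffered, incoming) => {
  const total = buffered + incoming
  return {
    quads: Math.floor(total / IN_BYTES_PER_QUAD), // each quad yields 4 leaves
    leftover: total % IN_BYTES_PER_QUAD,          // stays in the buffer
  }
}

plan(0, 100)  // => { quads: 0, leftover: 100 } — buffered, no leaves yet
plan(100, 54) // => { quads: 1, leftover: 27 }  — one quad flushed into leaves
```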
flush() {
const { layers } = this
let height = 0
while (height < layers.length) {
const layer = layers[height]
height += 1
let index = 0
while (index + 1 < layer.length) {
const node = Proof.computeNode(layer[index], layer[index + 1])
if (this.layers.length <= height) {
this.layers[height] = [node]
} else {
this.layers[height].push(node)
}
index += 2
}
layer.splice(0, index)
}
}

/**
* Resets this hasher to its initial state so it can be recycled as a new
* instance.
*/
reset() {
this.offset = 0
this.bytesWritten = 0n
this.layers = [[]]
this.layers.length = 1
this.layers[0].length = 0
return this
}
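
A small sketch of recycling a hasher via `reset`; the payload sizes are placeholders:

```js
const hasher = create()
hasher.write(new Uint8Array(256)).digest()

hasher.reset() // count, buffer offset, and layers all return to initial state
hasher.write(new Uint8Array(512)) // behaves exactly like a fresh hasher
```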

@@ -260,45 +289,73 @@ class Digest {
}

/**
* Prunes layers by combining node pairs into nodes in the next layer and
* removing them from the layer they were in. After pruning, each layer will
* end up with at most one node. New layers may be created in the process
* when nodes from the top layer are combined.
*
* @param {Layers} layers
*/
const prune = (layers) => flush(layers, false)

/**
* Flushes all the nodes in the layers by combining node pairs into nodes in
* the next layer. Layers with only one node are combined with zero-padded
* nodes (corresponding to the level of the layer). Unlike {@link prune},
* combined nodes are not removed and layers are copied instead of being
* mutated.
*
* @param {Layers} layers
* @param {API.MerkleTreeNode[]} [newNodes]
*/
const computedRoot = (layers, newNodes = []) => {
const build = (layers) => flush([...layers], true)

/**
* @param {Layers} layers
* @param {boolean} build
* @returns {Layers}
*/
const flush = (layers, build) => {
// Note it is important that we do not mutate any of the layers otherwise
// calling digest() will have a side-effect and produce wrong results.
let height = 0
while (height < layers.length || newNodes.length > 1) {
const layer = layers[height] ?? []
const nodes = newNodes.length ? [...layer, ...newNodes] : layer
// We already copied the nodes from the previous layer so we can clear it
// here in order to accumulate the new nodes for the next layer.
newNodes.length = 0
// writing more data into the hasher and computing the digest will produce
// wrong results.
let level = 0
// We will walk up the tree until we reach the top layer. However, we may
// end up creating new layers in the process, so we keep track of the
// current level as we go.
while (level < layers.length) {
let next = layers[level + 1]
const layer = layers[level]

// If we have an odd number of nodes and we have not reached the top
// layer, we have a bug in the code and we throw an error.
if (nodes.length % 2 > 0 && height + 1 < layers.length) {
nodes.push(ZeroPad.fromLevel(height))
// layer, we push a zero padding node corresponding to the current level.
if (build && layer.length % 2 > 0 && next) {
layer.push(ZeroPad.fromLevel(level))
}

// If we have 0 nodes in the current layer we just move up the tree.
if (nodes.length === 0) {
height += 1
} else {
level += 1

// If we have 0 nodes in the current layer we just move to the next one.
if (layer.length) {
// If we are building we copy the next layer (creating it if missing) so
// that we can push combined nodes without mutating the original layers.
next = next ? (build ? [...next] : next) : []
let index = 0
// Note that we have checked that we have an even number of nodes so
// we will never end up with an extra node when consuming two at a time.
while (index + 1 < nodes.length) {
const left = nodes[index]
const right = nodes[index + 1]
const node = Proof.computeNode(left, right)
newNodes.push(node)
while (index + 1 < layer.length) {
const node = Proof.computeNode(layer[index], layer[index + 1])
next.push(node)
index += 2
}
height += 1

if (next.length) {
layers[level] = next
}

if (!build) {
// we remove nodes that we have combined from the current layer to reduce
// memory overhead and move to the next layer.
layer.splice(0, index)
}
}
}

return newNodes.length
? { root: newNodes[0], height }
: { root: layers[layers.length - 1][0], height: height - 1 }
return layers
}
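
To illustrate how `build` climbs the layers, here is a conceptual trace on four leaves; `H` stands for `Proof.computeNode`, and the sketch is illustrative rather than a transcript of the code:

```js
// layer 0: [L0, L1, L2, L3]
// layer 1: [H(L0,L1), H(L2,L3)]
// layer 2: [H(H(L0,L1), H(L2,L3))]   <- single root; tree height = 2
//
// When a layer with an odd node count sits below an existing upper layer,
// a ZeroPad.fromLevel(level) node is appended before pairing, as the
// padding branch above does.
```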