Commit

Add tokenStats, some tests, and some error checks; also docs and readme updates

Maybe we should host JSDoc on GitHub Pages.

Also performed npm audit and updates for security vulnerabilities.
syonfox committed Dec 25, 2022
1 parent 63260ae commit 65a3181
Showing 7 changed files with 4,233 additions and 3,408 deletions.
92 changes: 82 additions & 10 deletions Encoder.js
@@ -85,21 +85,21 @@ const bpe_ranks = dictZip(bpe_merges, range(0, bpe_merges.length))
const cache = new Map;

/**
* This function appears to implement the Byte Pair Encoding (BPE) algorithm for subword tokenization.
* Implements the Byte Pair Encoding (BPE) algorithm for subword tokenization.
*
* The BPE algorithm operates on a vocabulary of subwords, and works by iteratively replacing the most frequent pair of
* subwords in the vocabulary with a new subword, until a specified vocabulary size is reached. This results in a
* vocabulary of subwords that can be used to represent words in a language, while still maintaining some of the
* structure and meaning of the original words.
* of subwords that can be used to represent words in a language, while still maintaining some of the structure and
* meaning of the original words.
*
* Here's a breakdown of the function:
* 1 The function first checks if the input token is in the cache, and if it is, it returns the cached value. This is likely to improve performance by avoiding unnecessary processing for tokens that have already been processed.
* 2 The input token is then split into individual characters, and a list of pairs of adjacent characters (bigrams) is generated using the get_pairs function. If there are no pairs, the input token is returned as is.
* 3 The function then enters a loop that continues until a termination condition is met. In each iteration, the pair of subwords with the lowest rank (as determined by the bpe_ranks object) is identified and stored in the bigram variable. If the bigram is not in bpe_ranks, the loop terminates.
* 4 The bigram is then replaced with a new subword in the word list. The word list is iterated over and any instances of the bigram are replaced with the new subword.
* 5 The word list is then joined back into a string and stored in the cache. The cached string is returned as the result of the function.
* @param token
* @return {string|*}
* @param {string} token - The input token to be tokenized.
* @return {string} word - The tokenized subwords as a string.
*/
function bpe(token) {
if (cache.has(token)) {
@@ -169,7 +169,21 @@ function bpe(token) {
return word
}
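The five steps in the breakdown above can be sketched as a toy merge loop. This is a hedged illustration only: `toyRanks` is a made-up two-entry rank table standing in for the real `bpe_ranks` (which is built from the merges file), and the real `bpe` additionally caches results and scans pairs via `get_pairs`.

```javascript
// Toy rank table: "l,l" merges first (rank 0), then "e,ll" (rank 1).
const toyRanks = new Map([["l,l", 0], ["e,ll", 1]]);

function toyBpe(token) {
  let word = token.split("");
  while (true) {
    // Step 3: find the adjacent pair with the lowest merge rank.
    let best = null, bestRank = Infinity;
    for (let i = 0; i < word.length - 1; i++) {
      const rank = toyRanks.get(word[i] + "," + word[i + 1]);
      if (rank !== undefined && rank < bestRank) {
        best = i;
        bestRank = rank;
      }
    }
    if (best === null) break; // no mergeable pair left: terminate
    // Step 4: merge the winning pair into a single subword.
    word = word
      .slice(0, best)
      .concat(word[best] + word[best + 1], word.slice(best + 2));
  }
  return word; // step 5 in the real code joins and caches this
}

// toyBpe("hello") merges l+l, then e+ll: ["h", "ell", "o"]
```

The same greedy lowest-rank-first loop drives the real function; only the rank table and the caching differ.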

/**
* Encodes a given text string into a list of BPE tokens.
*
* @param {string} text - The text to be encoded.
* @return {Array} bpe_tokens - The encoded BPE tokens.
*/
function encode(text) {
if(typeof text != "string") {
if(typeof text == "undefined") {
console.warn("undefined text returning empty []");
return [];
}
console.warn("casting to string; hope that's what you want!");
text = ""+text;
}
let bpe_tokens = []
const matches = Array.from(text.matchAll(pat)).map(x => x[0])
for (let token of matches) {
@@ -183,10 +197,57 @@ function encode(text) {
return bpe_tokens
}

// This function works by iterating through the matches of the pat pattern in the input text,
// encoding each match using the encodeStr function and the byte_encoder mapping,
// and then applying the bpe function to the encoded token. The number of tokens produced by the bpe function is then added to the count variable.
// Finally, the count variable is returned as the result.
/**
* Computes count, unique, and frequency statistics for a string or an array of tokens.
*
* @param {(string|Array<number>)} input - The input string or array of tokens.
* @return {Object} stats - An object with count, unique, and frequency properties.
*
* @property {number} stats.count - The total number of tokens.
* @property {number} stats.unique - The number of unique tokens.
* @property {Object} stats.frequency - An object with token-frequency pairs, sorted by frequency in descending order.
*/
function tokenStats(input) {
let tokens
if (typeof input === 'string') {
// Encode the string into tokens
tokens = encode(input)
} else {
tokens = input
}

const stats = {
count: tokens.length,
unique: new Set(tokens).size,
frequency: {}
}

// Compute the frequency of each token
for (let token of tokens) {
if (stats.frequency[token]) {
stats.frequency[token]++
} else {
stats.frequency[token] = 1
}
}

// Sort the frequency object by frequency in descending order
stats.frequency = Object.fromEntries(
Object.entries(stats.frequency).sort((a, b) => b[1] - a[1])
)

return stats
}
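For illustration, the shape `tokenStats` produces can be recomputed standalone on a small token array (a self-contained copy of the logic above, so it runs without the encoder). One caveat worth knowing: JavaScript enumerates integer-like object keys in ascending numeric order, so for numeric token ids the descending-frequency ordering applied above is not actually preserved in the resulting object.

```javascript
function tokenStatsDemo(tokens) {
  const frequency = {};
  for (const t of tokens) frequency[t] = (frequency[t] || 0) + 1;
  return {
    count: tokens.length,
    unique: new Set(tokens).size,
    // Sorted by descending frequency, as above. Note: integer-like keys
    // are re-ordered numerically by the JS engine, so this ordering only
    // survives for non-numeric keys (e.g. string tokens).
    frequency: Object.fromEntries(
      Object.entries(frequency).sort((a, b) => b[1] - a[1])
    ),
  };
}

// tokenStatsDemo([995, 995, 995, 31373]) →
// { count: 4, unique: 2, frequency: { "995": 3, "31373": 1 } }
```

If the ordering matters to callers, returning the sorted entries as an array of `[token, count]` pairs (or a `Map`, which preserves insertion order) would be a safer design than a plain object.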


/**
* This function works by iterating through the matches of the pat pattern in the input text,
* encoding each match using the encodeStr function and the byte_encoder mapping,
* and then applying the bpe function to the encoded token. The number of tokens produced by the bpe function is then added to the count variable.
* Finally, the count variable is returned as the result.
* @param text
* @return {number}
*/
function countTokens(text) {
let count = 0
const matches = Array.from(text.matchAll(pat)).map(x => x[0])
@@ -200,7 +261,17 @@ function countTokens(text) {
return count
}
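The counting strategy the doc comment above describes, summing per-match token counts instead of concatenating arrays, can be sketched with stand-ins for `pat` and `bpe` (both hypothetical here: a whitespace-delimited pattern and a one-token-per-character tokenizer):

```javascript
// Stand-ins for the real pat regex and bpe function (hypothetical).
const toyPat = /\S+/g;
const toyTokenize = (word) => word.split(""); // pretend each char is a token

function toyCountTokens(text) {
  let count = 0;
  // Walk the matches and accumulate counts; no big token array is built.
  for (const match of text.matchAll(toyPat)) {
    count += toyTokenize(match[0]).length;
  }
  return count;
}

// toyCountTokens("ab cde") → 5
```

The payoff over `encode(text).length` is avoiding the intermediate array; the test suite's `Random encode=decode count` test checks that both paths agree.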

/**
* Decodes a list of BPE tokens into a text string.
*
* @param {Array} tokens - The list of BPE tokens to be decoded.
* @return {string} text - The decoded text string.
*/
function decode(tokens) {
if(!tokens) {
console.warn("No tokens to decode, returning empty string")
return "";
}
let text = tokens.map(x => decoder[x]).join('')
text = decodeStr(text.split('').map(x => byte_decoder[x]))
return text
@@ -209,5 +280,6 @@ function decode(tokens) {
module.exports = {
encode,
decode,
countTokens
countTokens,
tokenStats
};
113 changes: 89 additions & 24 deletions Encoder.test.js
@@ -1,44 +1,109 @@
const {encode, decode} = require('./Encoder.js');
const {encode, decode, countTokens, tokenStats} = require('./Encoder.js');
const crypto = require('crypto');

// Generate a random string of a given length
function generateRandomString(length) {
return crypto.randomBytes(length).toString('hex');
}


test('empty string', () => {
const str = "";
expect(encode(str)).toEqual([])
expect(decode(encode(str))).toEqual(str)
const str = "";
expect(encode(str)).toEqual([])
expect(decode(encode(str))).toEqual(str)
});

test('space', () => {
const str = " ";
expect(encode(str)).toEqual([220])
expect(decode(encode(str))).toEqual(str)
const str = " ";
expect(encode(str)).toEqual([220])
expect(decode(encode(str))).toEqual(str)
});

test('tab', () => {
const str = "\t";
expect(encode(str)).toEqual([197])
expect(decode(encode(str))).toEqual(str)
const str = "\t";
expect(encode(str)).toEqual([197])
expect(decode(encode(str))).toEqual(str)
});

test('simple text', () => {
const str = "This is some text";
expect(encode(str)).toEqual([1212, 318, 617, 2420])
expect(decode(encode(str))).toEqual(str)
const str = "This is some text";
expect(encode(str)).toEqual([1212, 318, 617, 2420])
expect(decode(encode(str))).toEqual(str)
});

test('multi-token word', () => {
const str = "indivisible";
expect(encode(str)).toEqual([521, 452, 12843])
expect(decode(encode(str))).toEqual(str)
const str = "indivisible";
expect(encode(str)).toEqual([521, 452, 12843])
expect(decode(encode(str))).toEqual(str)
});

test('emojis', () => {
const str = "hello 👋 world 🌍";
expect(encode(str)).toEqual([31373, 50169, 233, 995, 12520, 234, 235])
expect(decode(encode(str))).toEqual(str)
const str = "hello 👋 world 🌍";
expect(encode(str)).toEqual([31373, 50169, 233, 995, 12520, 234, 235])
expect(decode(encode(str))).toEqual(str)
});

test('properties of Object',()=>{
const str = "toString constructor hasOwnProperty valueOf";
test('properties of Object', () => {
const str = "toString constructor hasOwnProperty valueOf";

expect(encode(str)).toEqual([1462, 10100, 23772, 468, 23858, 21746, 1988, 5189]);
expect(decode(encode(str))).toEqual(str);
})


test('Random encode=decode count', () => {
let n = 200
let str

let t = {
c:0,e:0,d:0,l: 0,f:0
}
for (let i = 0; i < n; i++) {
const randomNumber = Math.floor(Math.random() * (2 * n + 1)) + n;
str = generateRandomString(randomNumber);
t.l+= randomNumber;
let now = Date.now()
let _fe = encode(str);
t.f += Date.now()-now; now = Date.now();
let count = countTokens(str);
t.c += Date.now()-now; now = Date.now();
let e = encode(str);
t.e += Date.now()-now; now = Date.now();
let d = decode(e);
t.d += Date.now()-now; now = Date.now();
expect(d).toEqual(str);
expect(e.length).toEqual(count);

}

console.log(`Timings for chars(${t.l}): fencode: ${t.f}, counting: ${t.c}, encoding: ${t.e}, decoding:${t.d}`)


// const str = "toString constructor hasOwnProperty valueOf";
// expect(encode(str).length).toEqual(countTokens(str));
})

test('empty encode', () => {
expect(encode()).toEqual([]);

})
test('null encode', () => {
expect(encode(null)).toEqual(encode("null"));

})
test('empty decode', () => {
expect(decode()).toEqual("");

})

test('stats test', () => {
const str = "hello 👋 world 🌍, im a foo your a foo, everwer where a foo foo";

expect(encode(str)).toEqual([1462, 10100, 23772, 468, 23858, 21746, 1988, 5189]);
expect(decode(encode(str))).toEqual(str);
})
let e = encode(str);
let stats = tokenStats(e);
console.log("example stats: ", stats);
expect(tokenStats(e)).toEqual(tokenStats(str))
expect(decode(encode(str))).toEqual(str)
// const str = "toString constructor hasOwnProperty valueOf";
// expect(encode(str).length).toEqual(countTokens(str));
})
60 changes: 56 additions & 4 deletions README.md
@@ -1,6 +1,9 @@
# This is a fork of https://github.com/latitudegames/GPT-3-Encoder. I made this fork so I could apply some PRs that had been sent to the upstream repo.
#### This is a fork of https://github.com/latitudegames/GPT-3-Encoder. I made this fork so I could apply some PRs that had been sent to the upstream repo.

changelog:
add countTokens function
updated docs (npm run docs)

~~~

# GPT-3-Encoder
Javascript BPE Encoder Decoder for GPT-2 / GPT-3
@@ -11,15 +14,19 @@ GPT-2 and GPT-3 use byte pair encoding to turn text into a series of integers to
## Install with npm

```
npm install gpt-3-encoder
npm install @syonfox/gpt-3-encoder
```


## Usage

Compatible with Node >= 12

```js
const {encode, decode} = require('gpt-3-encoder')

import {encode, decode, countTokens, tokenStats} from "gpt-3-encoder"
//or
const {encode, decode, countTokens, tokenStats} = require('gpt-3-encoder')

const str = 'This is an example sentence to try encoding out on!'
const encoded = encode(str)
@@ -30,9 +37,54 @@ for(let token of encoded){
console.log({token, string: decode([token])})
}

//example count tokens usage
if(countTokens(str) > 5) {
console.log("String is over five tokens, inconceivable");
}

const decoded = decode(encoded)
console.log('We can decode it back into:\n', decoded)

```


## Developers

```sh
git clone https://github.com/syonfox/GPT-3-Encoder.git

cd GPT-3-Encoder

npm install

npm run test
npm run docs

less Encoder.js

firefox ./docs/index.html

npm publish
```

## todo

More stats that work well with this token representation.

Clean up and keep it simple.

More tests.

Performance analysis.

There are several performance improvements that could be made to the encode function
(from GPT; todo: vet these recommendations):

- Cache the results of the encodeStr function to avoid unnecessary computation. You can do this by using a map or an object to store the results of encodeStr for each input string.
- Use a regular expression to match the tokens in the input text instead of using the matchAll function. Regular expressions can be faster and more efficient than matchAll for certain types of patterns.
- Use a different data structure to store the byte_encoder and encoder maps. Objects and maps can have different performance characteristics depending on the size and complexity of the data, so it is worth experimenting to see which one works best for this use case.
- Use a different data structure to store the bpe_tokens array. Arrays can be slower than other data structures for certain operations, such as appending new elements or concatenating multiple arrays; a linked list or a queue might perform better.
- Use a different algorithm to compute the BPE codes for the tokens. The current implementation of the bpe function may be inefficient for large inputs or inputs with complex patterns; a divide-and-conquer or hashing-based approach could compute the BPE codes more efficiently.
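The first suggestion, memoizing a pure encoding function with a Map, might look like the sketch below. `encodeStrSlow` is a hypothetical stand-in, not the library's actual `encodeStr`.

```javascript
// Hypothetical stand-in for encodeStr: string -> array of UTF-8 byte values.
const encodeStrSlow = (s) => Array.from(Buffer.from(s, "utf-8"));

const encodeStrMemo = new Map();

function encodeStrCached(s) {
  // Compute once per distinct input, then serve the cached result.
  if (!encodeStrMemo.has(s)) encodeStrMemo.set(s, encodeStrSlow(s));
  return encodeStrMemo.get(s);
}
```

One trade-off to vet: this cache grows without bound, so for long-running processes an LRU eviction policy (or the existing bounded-size pattern, as with the module's `cache` Map for bpe) would be safer.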



11 changes: 7 additions & 4 deletions index.d.ts
@@ -1,7 +1,10 @@
declare module "gpt-3-encoder" {
export function encode(text: string): number[];
export function encode(text: string): number[];

export function decode(tokens: number[]): string;

export function countTokens(text: string): number;

export function tokenStats(input: string | number[]): any;

export function decode(tokens: number[]): string;

export function countTokens(text: string): number;
}
3 changes: 2 additions & 1 deletion index.js
@@ -1,7 +1,8 @@
const { encode, decode, countTokens } = require("./Encoder");
const { encode, decode, countTokens, tokenStats } = require("./Encoder");

module.exports = {
encode,
decode,
countTokens,
tokenStats
};
