Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/patch-1' into syonfox-dev
Browse files Browse the repository at this point in the history
# Conflicts:
#	Encoder.js
  • Loading branch information
syonfox committed Dec 25, 2022
2 parents ea8540d + 473daf7 commit 3b496fb
Showing 1 changed file with 20 additions and 2 deletions.
22 changes: 20 additions & 2 deletions Encoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -176,13 +176,30 @@ function encode(text) {
token = encodeStr(token).map(x => {
return byte_encoder[x]
}).join('')

const new_tokens = bpe(token).split(' ').map(x => encoder[x])
bpe_tokens = bpe_tokens.concat(new_tokens)
}
return bpe_tokens
}

/**
 * Count the BPE tokens that `text` splits into, without building the
 * array of token ids.
 *
 * Each match of `pat` in `text` is passed through `encodeStr`, every
 * resulting element is looked up in `byte_encoder`, and the joined string
 * is handed to `bpe`. `bpe` returns its pieces separated by single spaces,
 * so the number of pieces is the number of tokens for that match.
 *
 * NOTE(review): presumably equal to `encode(text).length`, since the
 * per-match steps mirror the visible part of `encode` — confirm.
 *
 * @param {string} text - Input text to measure.
 * @returns {number} Total number of BPE pieces across all matches.
 */
function countTokens(text) {
    let total = 0
    for (const match of text.matchAll(pat)) {
        // match[0] is the full matched substring; capture groups are ignored.
        const mapped = encodeStr(match[0]).map(b => byte_encoder[b]).join('')
        total += bpe(mapped).split(' ').length
    }
    return total
}

function decode(tokens) {
let text = tokens.map(x => decoder[x]).join('')
text = decodeStr(text.split('').map(x => byte_decoder[x]))
Expand All @@ -191,5 +208,6 @@ function decode(tokens) {

module.exports = {
encode,
decode
decode,
countTokens
};

0 comments on commit 3b496fb

Please sign in to comment.