Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Buffer added to avoid split characters #122

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 49 additions & 4 deletions Examples/WhisperDesktop/Utils/logger.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "stdafx.h"
#include "logger.h"
#include "miscUtils.h"
#include <string>

namespace
{
Expand Down Expand Up @@ -37,6 +38,37 @@ void printTime( CStringA& rdi, Whisper::sTimeSpan time, bool comma )
fields.ticks / 10'000 );
}

// Returns true when the first `len` bytes of `str` are well-formed UTF-8.
//
// str: pointer to the bytes to validate; embedded NUL bytes count as ordinary ASCII.
// len: number of bytes to examine; a value <= 0 examines nothing and yields true.
//
// Rejected input: stray or missing continuation bytes, sequences truncated at the end
// of the buffer, overlong encodings, UTF-16 surrogates (U+D800..U+DFFF), code points
// above U+10FFFF, and the obsolete 5- and 6-byte forms.
bool utf8_check_is_valid(const char *str, int len) {
    // based on https://gist.github.com/ichramm/3ffeaf7ba4f24853e9ecaf176da84566
    for (int i = 0; i < len; ++i) {
        const unsigned char lead = static_cast<unsigned char>(str[i]);

        int trailing = 0;            // number of continuation bytes that must follow `lead`
        unsigned char low = 0x80;    // permitted range of the FIRST continuation byte;
        unsigned char high = 0xBF;   // narrowed below for the lead bytes that need it

        if (lead <= 0x7F) {
            trailing = 0;            // 0bbbbbbb
        } else if (lead >= 0xC2 && lead <= 0xDF) {
            trailing = 1;            // 110bbbbb; 0xC0 and 0xC1 could only encode overlong ASCII
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            trailing = 2;            // 1110bbbb
            if (lead == 0xE0) {
                low = 0xA0;          // below 0xA0 would be an overlong encoding
            } else if (lead == 0xED) {
                high = 0x9F;         // above 0x9F would be a surrogate, U+D800 to U+DFFF
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            trailing = 3;            // 11110bbb
            if (lead == 0xF0) {
                low = 0x90;          // below 0x90 would be an overlong encoding
            } else if (lead == 0xF4) {
                high = 0x8F;         // above 0x8F would exceed U+10FFFF
            }
        } else {
            return false;            // lone continuation byte, 0xC0/0xC1, or 0xF5..0xFF
        }

        for (int j = 0; j < trailing; ++j) {
            if (++i == len) {
                return false;        // the sequence is cut off by the end of the buffer
            }
            const unsigned char next = static_cast<unsigned char>(str[i]);
            if (next < low || next > high) {
                return false;
            }
            // Only the first continuation byte has a restricted range.
            low = 0x80;
            high = 0xBF;
        }
    }
    return true;
}

HRESULT logNewSegments( const iTranscribeResult* results, size_t newSegments, bool printSpecial )
{
sTranscribeLength length;
Expand All @@ -49,6 +81,7 @@ HRESULT logNewSegments( const iTranscribeResult* results, size_t newSegments, bo
const sToken* const tokens = results->getTokens();

CStringA str;
std::string buffer;
for( ; i < len; i++ )
{
const sSegment& seg = segments[ i ];
Expand All @@ -62,10 +95,22 @@ HRESULT logNewSegments( const iTranscribeResult* results, size_t newSegments, bo
{
const sToken& tok = tokens[ seg.firstToken + j ];
if( !printSpecial && ( tok.flags & eTokenFlags::Special ) )
continue;
str += k_colors[ colorIndex( tok ) ];
str += tok.text;
str += "\033[0m";
continue;
if (utf8_check_is_valid(tok.text, strlen(tok.text))) {
str += k_colors[ colorIndex( tok ) ];
str += tok.text;
str += "\033[0m";
} else {
for (int k = 0; k < strlen(tok.text); k++) {
buffer.push_back(tok.text[k]);
if (utf8_check_is_valid(&buffer[0], buffer.size())) {
str += k_colors[ colorIndex( tok ) ];
str += &buffer[0];
str += "\033[0m";
buffer.clear();
}
}
}
}
logInfo( u8"%s", cstr( str ) );
}
Expand Down
57 changes: 55 additions & 2 deletions Examples/main/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,37 @@ namespace
return col;
}

// Returns true when the first `len` bytes of `str` are well-formed UTF-8.
//
// str: pointer to the bytes to validate; embedded NUL bytes count as ordinary ASCII.
// len: number of bytes to examine; a value <= 0 examines nothing and yields true.
//
// Rejected input: stray or missing continuation bytes, sequences truncated at the end
// of the buffer, overlong encodings, UTF-16 surrogates (U+D800..U+DFFF), code points
// above U+10FFFF, and the obsolete 5- and 6-byte forms.
static bool utf8_check_is_valid(const char *str, int len) {
    // based on https://gist.github.com/ichramm/3ffeaf7ba4f24853e9ecaf176da84566
    for (int i = 0; i < len; ++i) {
        const unsigned char lead = static_cast<unsigned char>(str[i]);

        int trailing = 0;            // number of continuation bytes that must follow `lead`
        unsigned char low = 0x80;    // permitted range of the FIRST continuation byte;
        unsigned char high = 0xBF;   // narrowed below for the lead bytes that need it

        if (lead <= 0x7F) {
            trailing = 0;            // 0bbbbbbb
        } else if (lead >= 0xC2 && lead <= 0xDF) {
            trailing = 1;            // 110bbbbb; 0xC0 and 0xC1 could only encode overlong ASCII
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            trailing = 2;            // 1110bbbb
            if (lead == 0xE0) {
                low = 0xA0;          // below 0xA0 would be an overlong encoding
            } else if (lead == 0xED) {
                high = 0x9F;         // above 0x9F would be a surrogate, U+D800 to U+DFFF
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            trailing = 3;            // 11110bbb
            if (lead == 0xF0) {
                low = 0x90;          // below 0x90 would be an overlong encoding
            } else if (lead == 0xF4) {
                high = 0x8F;         // above 0x8F would exceed U+10FFFF
            }
        } else {
            return false;            // lone continuation byte, 0xC0/0xC1, or 0xF5..0xFF
        }

        for (int j = 0; j < trailing; ++j) {
            if (++i == len) {
                return false;        // the sequence is cut off by the end of the buffer
            }
            const unsigned char next = static_cast<unsigned char>(str[i]);
            if (next < low || next > high) {
                return false;
            }
            // Only the first continuation byte has a restricted range.
            low = 0x80;
            high = 0xBF;
        }
    }
    return true;
}

HRESULT __cdecl newSegmentCallback( iContext* context, uint32_t n_new, void* user_data ) noexcept
{
ComLight::CComPtr<iTranscribeResult> results;
Expand All @@ -78,12 +109,23 @@ namespace
{
if( params.print_colors )
{
std::string buffer;
for( uint32_t j = 0; j < seg.countTokens; j++ )
{
const sToken& tok = tokens[ seg.firstToken + j ];
if( !params.print_special && ( tok.flags & eTokenFlags::Special ) )
continue;
wprintf( L"%S%s%S", k_colors[ colorIndex( tok ) ], utf16( tok.text ).c_str(), "\033[0m" );
if (utf8_check_is_valid(tok.text, strlen(tok.text))) // not using utf16 test just because I've just used utf8 in another part.
wprintf( L"%S%s%S", k_colors[ colorIndex( tok ) ], utf16( tok.text ).c_str(), "\033[0m" );
else {
for (int k = 0; k < strlen(tok.text); k++) {
buffer.push_back(tok.text[k]);
if (utf8_check_is_valid(&buffer[0], buffer.size())) {
wprintf( L"%S%s%S", k_colors[ colorIndex( tok ) ], utf16( buffer ).c_str(), "\033[0m");
buffer.clear();
}
}
}
}
}
else
Expand Down Expand Up @@ -123,12 +165,23 @@ namespace
to_timestamp( seg.time.end ).c_str(),
speaker.c_str() );

std::string buffer;
for( uint32_t j = 0; j < seg.countTokens; j++ )
{
const sToken& tok = tokens[ seg.firstToken + j ];
if( !params.print_special && ( tok.flags & eTokenFlags::Special ) )
continue;
wprintf( L"%S%s%S", k_colors[ colorIndex( tok ) ], utf16( tok.text ).c_str(), "\033[0m" );
if (utf8_check_is_valid(tok.text, strlen(tok.text))) // not using utf16 test just because I've just used utf8 in another part.
wprintf( L"%S%s%S", k_colors[ colorIndex( tok ) ], utf16( tok.text ).c_str(), "\033[0m" );
else {
for (int k = 0; k < strlen(tok.text); k++) {
buffer.push_back(tok.text[k]);
if (utf8_check_is_valid(&buffer[0], buffer.size())) {
wprintf( L"%S%s%S", k_colors[ colorIndex( tok ) ], utf16( buffer ).c_str(), "\033[0m");
buffer.clear();
}
}
}
}
printf( "\n" );
}
Expand Down