forked from diffblue/cbmc
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request diffblue#2543 from tautschnig/vs-unsigned-byte-swap
Explicit unsigned -> uint16_t casts to avoid conversion warnings
- Loading branch information
Showing
12 changed files
with
73 additions
and
108 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,16 +18,6 @@ Author: Daniel Kroening, [email protected] | |
#include <windows.h> | ||
#endif | ||
|
||
/// Determine endianness of the architecture | ||
/// \return True if the architecture is little_endian | ||
bool is_little_endian_arch() | ||
{ | ||
uint32_t i=1; | ||
return reinterpret_cast<uint8_t &>(i) != 0; | ||
} | ||
|
||
#define BUFSIZE 100 | ||
|
||
std::string narrow(const wchar_t *s) | ||
{ | ||
#ifdef _WIN32 | ||
|
@@ -138,9 +128,10 @@ static void utf8_append_code(unsigned int c, std::string &result) | |
} | ||
} | ||
|
||
/// \param utf32:encoded wide string | ||
/// \param s: UTF-32 encoded wide string | ||
/// \return utf8-encoded string with the same unicode characters as the input. | ||
std::string utf32_to_utf8(const std::basic_string<unsigned int> &s) | ||
std::string | ||
utf32_native_endian_to_utf8(const std::basic_string<unsigned int> &s) | ||
{ | ||
std::string result; | ||
|
||
|
@@ -166,51 +157,37 @@ std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide) | |
return argv_narrow; | ||
} | ||
|
||
/// A helper function for dealing with different UTF16 endians | ||
/// \par parameters: A 16-bit integer | ||
/// \return A 16-bit integer with bytes swapped | ||
uint16_t do_swap_bytes(uint16_t x) | ||
{ | ||
uint16_t b1=x & 0xFF; | ||
uint16_t b2=x & 0xFF00; | ||
return (b1 << 8) | (b2 >> 8); | ||
} | ||
|
||
|
||
void utf16_append_code(unsigned int code, bool swap_bytes, std::wstring &result) | ||
static void utf16_append_code(unsigned int code, std::wstring &result) | ||
{ | ||
// we do not specially handle 0xD800 to 0xDFFF, although | ||
// they are not valid Unicode code points | ||
|
||
if(code<0xFFFF) | ||
{ // code is encoded as one UTF16 character | ||
// we just take the code and possibly swap the bytes | ||
unsigned int a=(swap_bytes)?do_swap_bytes(code):code; | ||
result+=static_cast<wchar_t>(a); | ||
{ | ||
// code is encoded as one UTF16 character | ||
result += static_cast<wchar_t>(code); | ||
} | ||
else // code is encoded as two UTF16 characters | ||
{ | ||
// if this is valid unicode, we have | ||
// code<=0x10FFFF | ||
// but let's not check it programmatically | ||
|
||
// encode the code in UTF16, possibly swapping bytes. | ||
// encode the code in UTF16 | ||
code=code-0x10000; | ||
unsigned int i1=((code>>10) & 0x3ff) | 0xD800; | ||
unsigned int a1=(swap_bytes)?do_swap_bytes(static_cast<uint16_t>(i1)):i1; | ||
result+=static_cast<wchar_t>(a1); | ||
unsigned int i2=(code & 0x3ff) | 0xDC00; | ||
unsigned int a2=(swap_bytes)?do_swap_bytes(static_cast<uint16_t>(i2)):i2; | ||
result+=static_cast<wchar_t>(a2); | ||
const uint16_t i1 = static_cast<uint16_t>(((code >> 10) & 0x3ff) | 0xD800); | ||
result += static_cast<wchar_t>(i1); | ||
const uint16_t i2 = static_cast<uint16_t>((code & 0x3ff) | 0xDC00); | ||
result += static_cast<wchar_t>(i2); | ||
} | ||
} | ||
|
||
|
||
/// \par parameters: String in UTF-8 format, bool value indicating whether the | ||
/// endianness should be different from the architecture one. | ||
/// Convert UTF8-encoded string to UTF-16 with architecture-native endianness. | ||
/// \par parameters: String in UTF-8 format | ||
/// \return String in UTF-16 format, encoded with the native endianness of the | ||
/// architecture. | ||
std::wstring utf8_to_utf16(const std::string& in, bool swap_bytes) | ||
std::wstring utf8_to_utf16_native_endian(const std::string &in) | ||
{ | ||
std::wstring result; | ||
result.reserve(in.size()); | ||
|
@@ -263,33 +240,17 @@ std::wstring utf8_to_utf16(const std::string& in, bool swap_bytes) | |
code=32; | ||
} | ||
|
||
utf16_append_code(code, swap_bytes, result); | ||
utf16_append_code(code, result); | ||
} | ||
|
||
return result; | ||
} | ||
|
||
/// \par parameters: String in UTF-8 format | ||
/// \return String in UTF-16BE format | ||
std::wstring utf8_to_utf16_big_endian(const std::string &in) | ||
{ | ||
bool swap_bytes=is_little_endian_arch(); | ||
return utf8_to_utf16(in, swap_bytes); | ||
} | ||
|
||
/// \par parameters: String in UTF-8 format | ||
/// \return String in UTF-16LE format | ||
std::wstring utf8_to_utf16_little_endian(const std::string &in) | ||
{ | ||
bool swap_bytes=!is_little_endian_arch(); | ||
return utf8_to_utf16(in, swap_bytes); | ||
} | ||
|
||
/// \param ch: UTF-16LE character | ||
/// \param ch: UTF-16 character in architecture-native endianness encoding | ||
/// \param result: stream to receive string in US-ASCII format, with \\uxxxx | ||
/// escapes for other characters | ||
/// \param loc: locale to check for printable characters | ||
static void utf16_little_endian_to_java( | ||
static void utf16_native_endian_to_java( | ||
const wchar_t ch, | ||
std::ostringstream &result, | ||
const std::locale &loc) | ||
|
@@ -326,23 +287,23 @@ static void utf16_little_endian_to_java( | |
} | ||
} | ||
|
||
/// \param ch: UTF-16LE character | ||
/// \param ch: UTF-16 character in architecture-native endianness encoding | ||
/// \return String in US-ASCII format, with \\uxxxx escapes for other characters | ||
std::string utf16_little_endian_to_java(const wchar_t ch) | ||
std::string utf16_native_endian_to_java(const wchar_t ch) | ||
{ | ||
std::ostringstream result; | ||
const std::locale loc; | ||
utf16_little_endian_to_java(ch, result, loc); | ||
utf16_native_endian_to_java(ch, result, loc); | ||
return result.str(); | ||
} | ||
|
||
/// \param in: String in UTF-16LE format | ||
/// \param in: String in UTF-16 (native endianness) format | ||
/// \return String in US-ASCII format, with \\uxxxx escapes for other characters | ||
std::string utf16_little_endian_to_java(const std::wstring &in) | ||
std::string utf16_native_endian_to_java(const std::wstring &in) | ||
{ | ||
std::ostringstream result; | ||
const std::locale loc; | ||
for(const auto ch : in) | ||
utf16_little_endian_to_java(ch, result, loc); | ||
utf16_native_endian_to_java(ch, result, loc); | ||
return result.str(); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,20 +6,22 @@ Author: Vojtech Forejt, [email protected] | |
\*******************************************************************/ | ||
|
||
#include <cassert> | ||
#include <testing-utils/catch.hpp> | ||
|
||
#include <vector> | ||
#include <string> | ||
#include <codecvt> | ||
#include <iomanip> | ||
#include <iostream> | ||
#include <locale> | ||
|
||
#include <util/unicode.h> | ||
|
||
// the u8 prefix is only available from VS 2015 onwards | ||
#if !defined(_MSC_VER) || _MSC_VER >= 1900 | ||
|
||
// This unit test compares our implementation with the codecvt implementation, | ||
// checking bit-by-bit equivalence of results. | ||
|
||
bool paranoid_wstr_equals(const std::wstring &a, const std::wstring &b) | ||
static bool paranoid_wstr_equals(const std::wstring &a, const std::wstring &b) | ||
{ | ||
if(a.size() != b.size()) | ||
return false; | ||
|
@@ -35,7 +37,10 @@ bool paranoid_wstr_equals(const std::wstring &a, const std::wstring &b) | |
} | ||
|
||
// helper print function; can be called when debugging a problem | ||
void wstr_print(const std::wstring &a, const std::wstring &b) | ||
#if 0 | ||
#include <iostream> | ||
|
||
static void wstr_print(const std::wstring &a, const std::wstring &b) | ||
{ | ||
int endi=(a.size()>b.size())?a.size():b.size(); | ||
const unsigned char | ||
|
@@ -49,46 +54,46 @@ void wstr_print(const std::wstring &a, const std::wstring &b) | |
} | ||
std::cout << '\n'; | ||
} | ||
#endif | ||
|
||
void compare_utf8_to_utf16_big_endian(std::string& in) | ||
static bool compare_utf8_to_utf16(const std::string &in) | ||
{ | ||
std::wstring s1=utf8_to_utf16_big_endian(in); | ||
const std::wstring s1 = utf8_to_utf16_native_endian(in); | ||
|
||
typedef std::codecvt_utf8_utf16<wchar_t> codecvt_utf8_utf16t; | ||
std::wstring_convert<codecvt_utf8_utf16t> converter; | ||
std::wstring s2=converter.from_bytes(in); | ||
|
||
assert(paranoid_wstr_equals(s1, s2)); | ||
return paranoid_wstr_equals(s1, s2); | ||
} | ||
|
||
void compare_utf8_to_utf16_little_endian(std::string& in) | ||
TEST_CASE("unicode0", "[core][util][unicode]") | ||
{ | ||
std::wstring s1=utf8_to_utf16_little_endian(in); | ||
|
||
const std::codecvt_mode mode=std::codecvt_mode::little_endian; | ||
const unsigned long maxcode=0x10ffff; | ||
const std::string s = u8"abc"; | ||
REQUIRE(compare_utf8_to_utf16(s)); | ||
} | ||
|
||
typedef std::codecvt_utf8_utf16<wchar_t, maxcode, mode> codecvt_utf8_utf16t; | ||
std::wstring_convert<codecvt_utf8_utf16t> converter; | ||
std::wstring s2=converter.from_bytes(in); | ||
TEST_CASE("unicode1", "[core][util][unicode]") | ||
{ | ||
const std::string s = u8"\u0070\u00DF\u00E0\u00EF\u00F0\u00F7\u00F8"; | ||
REQUIRE(compare_utf8_to_utf16(s)); | ||
} | ||
|
||
assert(paranoid_wstr_equals(s1, s2)); | ||
TEST_CASE("unicode2", "[core][util][unicode]") | ||
{ | ||
const std::string s = u8"$¢€𐍈"; | ||
REQUIRE(compare_utf8_to_utf16(s)); | ||
} | ||
|
||
int main() | ||
TEST_CASE("unicode3", "[core][util][unicode]") | ||
{ | ||
std::string s; | ||
s=u8"\u0070\u00DF\u00E0\u00EF\u00F0\u00F7\u00F8"; | ||
compare_utf8_to_utf16_big_endian(s); | ||
compare_utf8_to_utf16_little_endian(s); | ||
s=u8"$¢€𐍈"; | ||
compare_utf8_to_utf16_big_endian(s); | ||
compare_utf8_to_utf16_little_endian(s); | ||
s=u8"𐐏𤭢"; | ||
compare_utf8_to_utf16_big_endian(s); | ||
compare_utf8_to_utf16_little_endian(s); | ||
s=u8"дȚȨɌṡʒʸͼἨѶݔݺ→⅒⅀▤▞╢◍⛳⻥龍ンㄗㄸ"; | ||
compare_utf8_to_utf16_big_endian(s); | ||
compare_utf8_to_utf16_little_endian(s); | ||
const std::string s = u8"𐐏𤭢"; | ||
REQUIRE(compare_utf8_to_utf16(s)); | ||
} | ||
|
||
TEST_CASE("unicode4", "[core][util][unicode]") | ||
{ | ||
const std::string s = u8"дȚȨɌṡʒʸͼἨѶݔݺ→⅒⅀▤▞╢◍⛳⻥龍ンㄗㄸ"; | ||
REQUIRE(compare_utf8_to_utf16(s)); | ||
} | ||
#endif |