From f5b7e18243b25a39ec53890344f9353a1b54d6cd Mon Sep 17 00:00:00 2001 From: Ben Noordhuis Date: Tue, 3 Mar 2015 01:37:59 +0100 Subject: [PATCH 1/8] src: remove unused code PR-URL: https://github.com/iojs/io.js/pull/1042 Reviewed-By: Trevor Norris --- src/string_bytes.cc | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/string_bytes.cc b/src/string_bytes.cc index 1df6879a6a0346..7b07c6b7da10b4 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -754,21 +754,17 @@ Local StringBytes::Encode(Isolate* isolate, Local StringBytes::Encode(Isolate* isolate, const uint16_t* buf, size_t buflen) { - const uint16_t* src = buf; - Local val; + if (buflen < EXTERN_APEX) { val = String::NewFromTwoByte(isolate, - src, + buf, String::kNormalString, buflen); } else { - val = ExternTwoByteString::NewFromCopy(isolate, src, buflen); + val = ExternTwoByteString::NewFromCopy(isolate, buf, buflen); } - if (src != buf) - delete[] src; - return val; } From 826cde866170918fbeeeffc8612d4f4e0a923869 Mon Sep 17 00:00:00 2001 From: Ben Noordhuis Date: Tue, 3 Mar 2015 01:42:18 +0100 Subject: [PATCH 2/8] src: fix gc heuristic for external twobyte strings Large external two-byte strings reported their character length instead of their byte length, throwing off the garbage collector heuristic by a factor of two. PR-URL: https://github.com/iojs/io.js/pull/1042 Reviewed-By: Trevor Norris --- src/string_bytes.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/string_bytes.cc b/src/string_bytes.cc index 7b07c6b7da10b4..c828363da6d29c 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -28,8 +28,7 @@ class ExternString: public ResourceType { public: ~ExternString() override { delete[] data_; - int64_t change_in_bytes = -static_cast(length_); - isolate()->AdjustAmountOfExternalAllocatedMemory(change_in_bytes); + isolate()->AdjustAmountOfExternalAllocatedMemory(-byte_length()); } const TypeName* data() const override { @@ -40,6 +39,10 @@ class ExternString: public ResourceType { return length_; } + int64_t byte_length() const { + return length() * sizeof(*data()); + } + static Local NewFromCopy(Isolate* isolate, const TypeName* data, size_t length) { @@ -69,7 +72,7 @@ class ExternString: public ResourceType { data, length); Local str = String::NewExternal(isolate, h_str); - isolate->AdjustAmountOfExternalAllocatedMemory(length); + isolate->AdjustAmountOfExternalAllocatedMemory(h_str->byte_length()); return scope.Escape(str); } From 364cc7e08a5d4b74891362f837ff9c34844c3d35 Mon Sep 17 00:00:00 2001 From: Ben Noordhuis Date: Tue, 3 Mar 2015 15:09:58 +0100 Subject: [PATCH 3/8] src: remove NODE_INVALID_UTF8 environment variable Introduced in joyent/node v0.10 as a backwards compatibility measure. It's an ugly hack and allowing invalid UTF-8 is not a good idea in the first place, remove it. PR-URL: https://github.com/iojs/io.js/pull/1042 Reviewed-By: Trevor Norris --- src/node.cc | 8 -------- src/string_bytes.cc | 7 ++++--- src/string_bytes.h | 2 -- src/util.cc | 9 +++------ 4 files changed, 7 insertions(+), 19 deletions(-) diff --git a/src/node.cc b/src/node.cc index 1bee586033515c..22c4a9b1547e60 100644 --- a/src/node.cc +++ b/src/node.cc @@ -139,9 +139,6 @@ static uv_async_t dispatch_debug_messages_async; static Isolate* node_isolate = nullptr; -int WRITE_UTF8_FLAGS = v8::String::HINT_MANY_WRITES_EXPECTED | - v8::String::NO_NULL_TERMINATION; - class ArrayBufferAllocator : public ArrayBuffer::Allocator { public: // Impose an upper limit to avoid out of memory errors that bring down @@ -3819,11 +3816,6 @@ static void StartNodeInstance(void* arg) { int Start(int argc, char** argv) { PlatformInit(); - const char* replace_invalid = secure_getenv("NODE_INVALID_UTF8"); - - if (replace_invalid == nullptr) - WRITE_UTF8_FLAGS |= String::REPLACE_INVALID_UTF8; - CHECK_GT(argc, 0); // Hack around with the argv pointer. Used for process.title = "blah". diff --git a/src/string_bytes.cc b/src/string_bytes.cc index c828363da6d29c..4a91f25048e5c1 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -287,8 +287,9 @@ size_t StringBytes::Write(Isolate* isolate, Local str = val.As(); len = len < buflen ? len : buflen; - int flags = String::NO_NULL_TERMINATION | - String::HINT_MANY_WRITES_EXPECTED; + int flags = String::HINT_MANY_WRITES_EXPECTED | + String::NO_NULL_TERMINATION | + String::REPLACE_INVALID_UTF8; switch (encoding) { case ASCII: @@ -311,7 +312,7 @@ size_t StringBytes::Write(Isolate* isolate, // well? memcpy(buf, data, len); else - len = str->WriteUtf8(buf, buflen, chars_written, WRITE_UTF8_FLAGS); + len = str->WriteUtf8(buf, buflen, chars_written, flags); break; case UCS2: diff --git a/src/string_bytes.h b/src/string_bytes.h index 424d9245aad4dc..2fcfedaa098b67 100644 --- a/src/string_bytes.h +++ b/src/string_bytes.h @@ -10,8 +10,6 @@ namespace node { -extern int WRITE_UTF8_FLAGS; - class StringBytes { public: class InlineDecoder { diff --git a/src/util.cc b/src/util.cc index 1c57a976e1164f..1e8d801dbab9e5 100644 --- a/src/util.cc +++ b/src/util.cc @@ -23,12 +23,9 @@ Utf8Value::Utf8Value(v8::Isolate* isolate, v8::Handle value) str = str_st_; CHECK_NE(str, NULL); - int flags = WRITE_UTF8_FLAGS; - - length_ = val_->WriteUtf8(str, - len, - 0, - flags); + const int flags = + v8::String::NO_NULL_TERMINATION | v8::String::REPLACE_INVALID_UTF8; + length_ = val_->WriteUtf8(str, len, 0, flags); str[length_] = '\0'; str_ = reinterpret_cast(str); From c9ee654290d4b5f99855e06ff64a1cb8ed3e50b4 Mon Sep 17 00:00:00 2001 From: Ben Noordhuis Date: Tue, 3 Mar 2015 15:16:07 +0100 Subject: [PATCH 4/8] src: simplify node::Utf8Value() * Remove kStorageSize constant. * Remove superfluous local variable and reinterpret_cast. * Reorder data members so the length field and data pointer (if not the data itself) fit in a single cache line. PR-URL: https://github.com/iojs/io.js/pull/1042 Reviewed-By: Trevor Norris --- src/util.cc | 21 ++++++++------------- src/util.h | 3 +-- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/src/util.cc b/src/util.cc index 1e8d801dbab9e5..a0bd077a93aaf9 100644 --- a/src/util.cc +++ b/src/util.cc @@ -1,11 +1,10 @@ #include "util.h" - #include "string_bytes.h" namespace node { Utf8Value::Utf8Value(v8::Isolate* isolate, v8::Handle value) - : length_(0), str_(nullptr) { + : length_(0), str_(str_st_) { if (value.IsEmpty()) return; @@ -15,19 +14,15 @@ Utf8Value::Utf8Value(v8::Isolate* isolate, v8::Handle value) // Allocate enough space to include the null terminator size_t len = StringBytes::StorageSize(val_, UTF8) + 1; - - char* str; - if (len > kStorageSize) - str = static_cast(malloc(len)); - else - str = str_st_; - CHECK_NE(str, NULL); + if (len > sizeof(str_st_)) { + str_ = static_cast(malloc(len)); + CHECK_NE(str_, nullptr); + } const int flags = v8::String::NO_NULL_TERMINATION | v8::String::REPLACE_INVALID_UTF8; - length_ = val_->WriteUtf8(str, len, 0, flags); - str[length_] = '\0'; - - str_ = reinterpret_cast(str); + length_ = val_->WriteUtf8(str_, len, 0, flags); + str_[length_] = '\0'; } + } // namespace node diff --git a/src/util.h b/src/util.h index 5742252688111b..ea17a155745993 100644 --- a/src/util.h +++ b/src/util.h @@ -190,10 +190,9 @@ class Utf8Value { }; private: - static const int kStorageSize = 1024; size_t length_; - char str_st_[kStorageSize]; char* str_; + char str_st_[1024]; }; } // namespace node From 4aea16f21419a2784d2af520eed30d80b75a1607 Mon Sep 17 00:00:00 2001 From: Ben Noordhuis Date: Tue, 3 Mar 2015 15:20:54 +0100 Subject: [PATCH 5/8] src: rename confusingly named local variable Rename `val_` to `string`. The underscore suffix is normally reserved for data members, not locals, and it's not a great name in the first place. PR-URL: https://github.com/iojs/io.js/pull/1042 Reviewed-By: Trevor Norris --- src/util.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/util.cc b/src/util.cc index a0bd077a93aaf9..f382b3d565a8cf 100644 --- a/src/util.cc +++ b/src/util.cc @@ -8,12 +8,12 @@ Utf8Value::Utf8Value(v8::Isolate* isolate, v8::Handle value) if (value.IsEmpty()) return; - v8::Local val_ = value->ToString(isolate); - if (val_.IsEmpty()) + v8::Local string = value->ToString(isolate); + if (string.IsEmpty()) return; // Allocate enough space to include the null terminator - size_t len = StringBytes::StorageSize(val_, UTF8) + 1; + size_t len = StringBytes::StorageSize(string, UTF8) + 1; if (len > sizeof(str_st_)) { str_ = static_cast(malloc(len)); CHECK_NE(str_, nullptr); @@ -21,7 +21,7 @@ Utf8Value::Utf8Value(v8::Isolate* isolate, v8::Handle value) const int flags = v8::String::NO_NULL_TERMINATION | v8::String::REPLACE_INVALID_UTF8; - length_ = val_->WriteUtf8(str_, len, 0, flags); + length_ = string->WriteUtf8(str_, len, 0, flags); str_[length_] = '\0'; } From e2fb733a951a51821c03cf83400607522da52e44 Mon Sep 17 00:00:00 2001 From: Ben Noordhuis Date: Tue, 3 Mar 2015 16:07:26 +0100 Subject: [PATCH 6/8] test: simplify parallel/test-stringbytes-external Make the algorithm that creates the big input strings a little easier to comprehend. No functional changes, the string lengths are unchanged. PR-URL: https://github.com/iojs/io.js/pull/1042 Reviewed-By: Trevor Norris --- test/parallel/test-stringbytes-external.js | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/test/parallel/test-stringbytes-external.js b/test/parallel/test-stringbytes-external.js index 331897286ad165..e4e3f366a1bb59 100644 --- a/test/parallel/test-stringbytes-external.js +++ b/test/parallel/test-stringbytes-external.js @@ -15,15 +15,10 @@ assert.equal(b[0], 0x61); assert.equal(b[1], 0); assert.equal(ucs2_control, c); - -// grow the strings to proper length -while (write_str.length <= EXTERN_APEX) { - write_str += write_str; - ucs2_control += ucs2_control; -} -write_str += write_str.substr(0, EXTERN_APEX - write_str.length); -ucs2_control += ucs2_control.substr(0, EXTERN_APEX * 2 - ucs2_control.length); - +// now create big strings +var size = 1 + (1 << 20); +write_str = Array(size).join(write_str); +ucs2_control = Array(size).join(ucs2_control); // check resultant buffer and output string var b = new Buffer(write_str, 'ucs2'); From 2eda2d609658826c559fca1944b0e6aafb9d1344 Mon Sep 17 00:00:00 2001 From: Ben Noordhuis Date: Tue, 3 Mar 2015 14:46:16 +0100 Subject: [PATCH 7/8] src: fix external string length calculation Make StringBytes::GetExternalParts() return the byte length for two-byte strings, not the character length. Its callers operate on bytes, not characters. This also fixes StringBytes::Size() reporting only half of the actual number of bytes for external two-byte strings. PR-URL: https://github.com/iojs/io.js/pull/1042 Reviewed-By: Trevor Norris --- src/string_bytes.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/string_bytes.cc b/src/string_bytes.cc index 4a91f25048e5c1..1f5e592a32569b 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -263,7 +263,7 @@ bool StringBytes::GetExternalParts(Isolate* isolate, const String::ExternalStringResource* ext; ext = str->GetExternalStringResource(); *data = reinterpret_cast(ext->data()); - *len = ext->length(); + *len = ext->length() * sizeof(*ext->data()); return true; } @@ -317,7 +317,7 @@ size_t StringBytes::Write(Isolate* isolate, case UCS2: if (is_extern) - memcpy(buf, data, len * 2); + memcpy(buf, data, len); else len = str->Write(reinterpret_cast(buf), 0, buflen, flags); if (IsBigEndian()) { From 1640dedb3b2a8d6e54ba7b22290d86d5984768be Mon Sep 17 00:00:00 2001 From: Ben Noordhuis Date: Tue, 3 Mar 2015 15:44:54 +0100 Subject: [PATCH 8/8] src: fix ucs-2 buffer encoding regression StringBytes::Write() did a plain memcpy() when is_extern is true but that's wrong when the source is a two-byte string and the destination a one-byte or UTF-8 string. The impact is limited to strings > 1,031,913 bytes because those are normally the only strings that are externalized, although the use of the 'externalize strings' extension (--expose_externalize_string) can also trigger it. This commit also cleans up the bytes versus characters confusion in StringBytes::Write() because that was closely intertwined with the UCS-2 encoding regression. One wasn't fixable without the other. Fixes: https://github.com/iojs/io.js/issues/1024 Fixes: https://github.com/joyent/node/issues/8683 PR-URL: https://github.com/iojs/io.js/pull/1042 Reviewed-By: Trevor Norris --- src/node_buffer.cc | 3 - src/string_bytes.cc | 72 +++++++++++----------- test/parallel/test-stringbytes-external.js | 13 ++++ 3 files changed, 49 insertions(+), 39 deletions(-) diff --git a/src/node_buffer.cc b/src/node_buffer.cc index 2d00ca97cc0f58..fa77c0779762a6 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -415,9 +415,6 @@ void StringWrite(const FunctionCallbackInfo& args) { if (max_length == 0) return args.GetReturnValue().Set(0); - if (encoding == UCS2) - max_length = max_length / 2; - if (offset >= obj_length) return env->ThrowRangeError("Offset is out of bounds"); diff --git a/src/string_bytes.cc b/src/string_bytes.cc index 1f5e592a32569b..4f896ace3fb693 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -279,13 +279,15 @@ size_t StringBytes::Write(Isolate* isolate, int* chars_written) { HandleScope scope(isolate); const char* data = nullptr; - size_t len = 0; - bool is_extern = GetExternalParts(isolate, val, &data, &len); - size_t extlen = len; + size_t nbytes = 0; + const bool is_extern = GetExternalParts(isolate, val, &data, &nbytes); + const size_t external_nbytes = nbytes; CHECK(val->IsString() == true); Local str = val.As(); - len = len < buflen ? len : buflen; + + if (nbytes > buflen) + nbytes = buflen; int flags = String::HINT_MANY_WRITES_EXPECTED | String::NO_NULL_TERMINATION | @@ -295,67 +297,65 @@ size_t StringBytes::Write(Isolate* isolate, case ASCII: case BINARY: case BUFFER: - if (is_extern) - memcpy(buf, data, len); - else - len = str->WriteOneByte(reinterpret_cast(buf), - 0, - buflen, - flags); + if (is_extern && str->IsOneByte()) { + memcpy(buf, data, nbytes); + } else { + uint8_t* const dst = reinterpret_cast(buf); + nbytes = str->WriteOneByte(dst, 0, buflen, flags); + } if (chars_written != nullptr) - *chars_written = len; + *chars_written = nbytes; break; case UTF8: - if (is_extern) - // TODO(tjfontaine) should this validate invalid surrogate pairs as - // well? - memcpy(buf, data, len); - else - len = str->WriteUtf8(buf, buflen, chars_written, flags); + nbytes = str->WriteUtf8(buf, buflen, chars_written, flags); break; - case UCS2: - if (is_extern) - memcpy(buf, data, len); - else - len = str->Write(reinterpret_cast(buf), 0, buflen, flags); + case UCS2: { + uint16_t* const dst = reinterpret_cast(buf); + size_t nchars; + if (is_extern && !str->IsOneByte()) { + memcpy(buf, data, nbytes); + nchars = nbytes / sizeof(*dst); + } else { + nchars = buflen / sizeof(*dst); + nchars = str->Write(dst, 0, nchars, flags); + nbytes = nchars * sizeof(*dst); + } if (IsBigEndian()) { // Node's "ucs2" encoding wants LE character data stored in // the Buffer, so we need to reorder on BE platforms. See // http://nodejs.org/api/buffer.html regarding Node's "ucs2" // encoding specification - uint16_t* buf16 = reinterpret_cast(buf); - for (size_t i = 0; i < len; i++) { - buf16[i] = (buf16[i] << 8) | (buf16[i] >> 8); - } + for (size_t i = 0; i < nchars; i++) + dst[i] = dst[i] << 8 | dst[i] >> 8; } if (chars_written != nullptr) - *chars_written = len; - len = len * sizeof(uint16_t); + *chars_written = nchars; break; + } case BASE64: if (is_extern) { - len = base64_decode(buf, buflen, data, extlen); + nbytes = base64_decode(buf, buflen, data, external_nbytes); } else { String::Value value(str); - len = base64_decode(buf, buflen, *value, value.length()); + nbytes = base64_decode(buf, buflen, *value, value.length()); } if (chars_written != nullptr) { - *chars_written = len; + *chars_written = nbytes; } break; case HEX: if (is_extern) { - len = hex_decode(buf, buflen, data, extlen); + nbytes = hex_decode(buf, buflen, data, external_nbytes); } else { String::Value value(str); - len = hex_decode(buf, buflen, *value, value.length()); + nbytes = hex_decode(buf, buflen, *value, value.length()); } if (chars_written != nullptr) { - *chars_written = len * 2; + *chars_written = nbytes; } break; @@ -364,7 +364,7 @@ size_t StringBytes::Write(Isolate* isolate, break; } - return len; + return nbytes; } diff --git a/test/parallel/test-stringbytes-external.js b/test/parallel/test-stringbytes-external.js index e4e3f366a1bb59..5bc4c945e87705 100644 --- a/test/parallel/test-stringbytes-external.js +++ b/test/parallel/test-stringbytes-external.js @@ -106,3 +106,16 @@ var PRE_3OF4_APEX = Math.ceil((EXTERN_APEX / 4) * 3) - RADIOS; } } })(); + +// https://github.com/iojs/io.js/issues/1024 +(function() { + var a = Array(1 << 20).join('x'); + var b = Buffer(a, 'ucs2').toString('ucs2'); + var c = Buffer(b, 'utf8').toString('utf8'); + + assert.equal(a.length, b.length); + assert.equal(b.length, c.length); + + assert.equal(a, b); + assert.equal(b, c); +})();