-
Notifications
You must be signed in to change notification settings - Fork 207
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(ndk): add support for reading modified-utf8 sequences from `ByteBuffers`
- Loading branch information
Showing
3 changed files
with
238 additions
and
3 deletions.
There are no files selected for viewing
132 changes: 129 additions & 3 deletions
132
bugsnag-plugin-android-ndk/src/main/java/com/bugsnag/android/ndk/ByteBufferExtensions.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,137 @@ | ||
@file:Suppress("MagicNumber") // this file is filled with numbers used in modified-utf8 | ||
package com.bugsnag.android.ndk | ||
|
||
import java.nio.ByteBuffer | ||
import kotlin.math.min | ||
|
||
/**
 * Unicode replacement character (U+FFFD), written to the decoded output wherever a malformed or
 * truncated modified UTF-8 sequence is encountered.
 */
private const val UTF_REPLACEMENT_CHAR: Char = '\uFFFD'
|
||
/**
 * Read the next four bytes at the current position as an `Int` and advance the position by four.
 *
 * The bytes are composed using whatever byte order this buffer is currently configured with.
 * NOTE(review): the name suggests the platform's native byte order - presumably callers set
 * `order(...)` on the buffer beforehand; confirm against the call sites.
 */
internal fun ByteBuffer.getNativeInt(): Int {
    return this.getInt()
}
/**
 * Read the next eight bytes at the current position as a `Long` and advance the position by
 * eight.
 *
 * The bytes are composed using whatever byte order this buffer is currently configured with.
 * NOTE(review): the name suggests the platform's native byte order - presumably callers set
 * `order(...)` on the buffer beforehand; confirm against the call sites.
 */
internal fun ByteBuffer.getNativeLong(): Long {
    return this.getLong()
}
|
||
internal fun ByteBuffer.getCString(byteCount: Int): String { | ||
position(position() + byteCount) | ||
return "" | ||
/**
 * Decode up to [allocatedByteCount] bytes as a null-terminated sequence of modified UTF-8 bytes.
 * This reads the same format as the JNI `NewStringUTF` function, but also obeys the
 * null-terminator byte used by C strings.
 *
 * The buffer position is always advanced past the whole allocated slot, that is by
 * `min(allocatedByteCount, remaining())` bytes, regardless of where the terminator was found.
 * The returned `String` may therefore contain fewer (or even zero) characters than bytes
 * consumed. A negative [allocatedByteCount] is treated as zero: nothing is consumed and an
 * empty `String` is returned.
 *
 * This function never throws on malformed input: invalid or truncated sequences are represented
 * by the Unicode replacement character (U+FFFD) in the returned `String`, and decoding may stop
 * before the end of the allocated bytes.
 *
 * @param allocatedByteCount the size in bytes of the fixed-length C `char` array holding the
 * string (including any terminator and padding)
 * @return the decoded string, excluding the terminator and anything after it
 */
internal fun ByteBuffer.getCString(allocatedByteCount: Int): String {
    val origin = position()
    // never read past the end of the buffer, and never let a negative count reach CharArray()
    val maxBytes = min(allocatedByteCount, remaining()).coerceAtLeast(0)

    // allocate a CharArray to handle the decoded string
    // it can't be longer than the number of bytes in the buffer
    val chars = CharArray(maxBytes)
    var bytesRead = 0
    var outIndex = 0
    var c = 0

    // fast path for ASCII-7 compatible characters / strings
    // absolute get() is used throughout, so position() is untouched until the very end
    while (bytesRead < maxBytes) {
        c = get(origin + bytesRead).toInt() and 0xff
        // 128+ = we need to take the "slow" path
        // 0 = null-terminator - this is the end of the string
        if (c >= 128 || c == 0) break

        chars[outIndex++] = c.toChar()
        bytesRead++
    }

    // make sure we didn't previously reach the end of the string
    // (c is also 0 when maxBytes is 0, as the loop above never ran)
    if (c != 0) {
        outIndex = readModifiedUtf8(bytesRead, maxBytes, origin, chars, outIndex)
    }

    // move the ByteBuffer position to after the string's allocated slot
    position(origin + maxBytes)
    return String(chars, 0, outIndex)
}
|
||
/**
 * Read a modified-utf8 string directly from a `ByteBuffer`. This follows the same decoding scheme
 * as [java.io.DataInputStream] but stops at the first null (zero) byte, staying compliant with
 * the C-string format, and never throws on malformed input.
 *
 * Only absolute reads are performed, so the position of this `ByteBuffer` is not changed.
 *
 * Malformed input is handled as follows:
 * - a multi-byte sequence cut off by [maxBytes], or by a null-terminator appearing where a
 *   continuation byte is expected, emits a single U+FFFD and ends the string
 * - an invalid lead byte (`10xx xxxx` or `1111 xxxx`) emits a single U+FFFD and ends the string
 * - a non-zero byte that is not a valid continuation byte emits U+FFFD followed by the character
 *   assembled from the bytes as-is, and decoding continues with the next sequence
 *
 * @param bytesRead how many bytes have already been read by [getCString]
 * @param maxBytes the maximum number of bytes to read for this string
 * @param origin the position/index in the ByteBuffer of the first byte for this string,
 * this is *not* the first byte to be read by this function
 * @param outBuffer the buffer to output the decoded characters into, this must have room for at
 * least one `Char` per remaining byte
 * @param outIndex the index within [outBuffer] of the first character to decode
 *
 * @return the length of the string that was decoded
 */
@Suppress("LoopWithTooManyJumpStatements", "CyclomaticComplexMethod")
private fun ByteBuffer.readModifiedUtf8(
    bytesRead: Int,
    maxBytes: Int,
    origin: Int,
    outBuffer: CharArray,
    outIndex: Int
): Int {
    var offset = bytesRead
    var written = outIndex
    while (offset < maxBytes) {
        val c = get(origin + offset).toInt() and 0xff
        if (c == 0) {
            // null-terminator - this is the end of the string
            break
        }

        // the high nibble of the lead byte determines the length of the sequence
        when (c shr 4) {
            0, 1, 2, 3, 4, 5, 6, 7 -> {
                /* 0xxxxxxx*/
                offset++
                outBuffer[written++] = c.toChar()
            }

            12, 13 -> {
                /* 110x xxxx 10xx xxxx*/
                offset += 2
                if (offset > maxBytes) {
                    // Truncated sequence - we don't error out, we return what we *do* have
                    outBuffer[written++] = UTF_REPLACEMENT_CHAR
                    break
                }

                val char2 = get(origin + offset - 1).toInt() and 0xff
                if (char2 == 0) {
                    // the C-string was terminated in the middle of this character: a raw zero
                    // byte is never valid modified-utf8 data, so don't decode past it
                    outBuffer[written++] = UTF_REPLACEMENT_CHAR
                    break
                }

                if (char2 and 0xc0 != 0x80) {
                    // Invalid UTF-8 - flag it, but keep decoding the rest of the string
                    outBuffer[written++] = UTF_REPLACEMENT_CHAR
                }

                outBuffer[written++] = ((c and 0x1f shl 6) or (char2 and 0x3f)).toChar()
            }

            14 -> {
                /* 1110 xxxx 10xx xxxx 10xx xxxx */
                offset += 3
                if (offset > maxBytes) {
                    // Truncated sequence - we don't error out, we return what we *do* have
                    outBuffer[written++] = UTF_REPLACEMENT_CHAR
                    break
                }

                val char2 = get(origin + offset - 2).toInt() and 0xff
                val char3 = get(origin + offset - 1).toInt() and 0xff
                if (char2 == 0 || char3 == 0) {
                    // the C-string was terminated in the middle of this character: a raw zero
                    // byte is never valid modified-utf8 data, so don't decode past it
                    outBuffer[written++] = UTF_REPLACEMENT_CHAR
                    break
                }

                if (char2 and 0xc0 != 0x80 || char3 and 0xc0 != 0x80) {
                    // Invalid UTF-8 - flag it, but keep decoding the rest of the string
                    outBuffer[written++] = UTF_REPLACEMENT_CHAR
                }

                outBuffer[written++] =
                    ((c and 0x0f shl 12) or (char2 and 0x3f shl 6) or (char3 and 0x3f)).toChar()
            }

            else -> {
                // Invalid lead byte - but we don't error out, we return what we *do* have
                outBuffer[written++] = UTF_REPLACEMENT_CHAR
                break
            }
        }
    }
    return written
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
84 changes: 84 additions & 0 deletions
84
bugsnag-plugin-android-ndk/src/test/java/com/bugsnag/android/ndk/CStringDecoderTest.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
package com.bugsnag.android.ndk | ||
|
||
import org.junit.Assert.assertEquals | ||
import org.junit.Test | ||
import java.nio.ByteBuffer | ||
|
||
/**
 * Tests for `ByteBuffer.getCString`, covering plain ASCII, multi-byte modified UTF-8 text, and
 * input that has been truncated or corrupted.
 */
class CStringDecoderTest {
    // "ναι, αυτό γράφτηκε με αυτόματη μετάφραση" (two-byte sequences mixed with ASCII),
    // followed by zero padding
    private val greekBytes = byteArrayOf(
        -50, -67, -50, -79, -50, -71, 44, 32,
        -50, -79, -49, -123, -49, -124, -49, -116,
        32, -50, -77, -49, -127, -50, -84, -49,
        -122, -49, -124, -50, -73, -50, -70, -50,
        -75, 32, -50, -68, -50, -75, 32, -50,
        -79, -49, -123, -49, -124, -49, -116, -50,
        -68, -50, -79, -49, -124, -50, -73, 32,
        -50, -68, -50, -75, -49, -124, -50, -84,
        -49, -122, -49, -127, -50, -79, -49, -125,
        -50, -73,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    )

    // "はい、これは機械翻訳で書かれています" (three-byte sequences), followed by zero padding
    private val extendedBytes = byteArrayOf(
        -29, -127, -81, -29, -127, -124, -29, -128,
        -127, -29, -127, -109, -29, -126, -116, -29,
        -127, -81, -26, -87, -97, -26, -94, -80,
        -25, -65, -69, -24, -88, -77, -29, -127,
        -89, -26, -101, -72, -29, -127, -117, -29,
        -126, -116, -29, -127, -90, -29, -127, -124,
        -29, -127, -66, -29, -127, -103,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    )

    @Test
    fun testAscii7Compatible() {
        val encoded = byteArrayOf(
            0x63, 0x6f, 0x6d, 0x2e, 0x65, 0x78, 0x61, 0x6d,
            0x70, 0x6c, 0x65, 0x2e, 0x62, 0x75, 0x67, 0x73,
            0x6e, 0x61, 0x67, 0x2e, 0x61, 0x6e, 0x64, 0x72,
            0x6f, 0x69, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00
        )
        val buffer = ByteBuffer.wrap(encoded)

        val decoded = buffer.getCString(buffer.remaining())

        assertEquals("com.example.bugsnag.android", decoded)
    }

    @Test
    fun testEmptyString() {
        // a freshly allocated buffer is all zeros, so the very first byte is the terminator
        val buffer = ByteBuffer.allocate(64)

        val decoded = buffer.getCString(64)

        assertEquals("", decoded)
        // the whole allocation must still have been consumed
        assertEquals(0, buffer.remaining())
    }

    @Test
    fun testNonAscii7Compatible() {
        val buffer = ByteBuffer.wrap(extendedBytes)

        val decoded = buffer.getCString(buffer.remaining())

        assertEquals("はい、これは機械翻訳で書かれています", decoded)
    }

    @Test
    fun testInvalidStrings() {
        val buffer = ByteBuffer.wrap(extendedBytes)
        // stop one byte short of the terminator so that the final character is cut off
        val truncatedLength = extendedBytes.indexOf(0) - 1

        val truncated = buffer.getCString(truncatedLength)
        assertEquals("はい、これは機械翻訳で書かれていま�", truncated)

        // overwrite a continuation byte with an ASCII space and decode again from the start
        buffer.rewind()
        buffer.put(16, 32)

        val corrupted = buffer.getCString(truncatedLength)
        assertEquals("はい、これ�㠯機械翻訳で書かれていま�", corrupted)
    }

    @Test
    fun testGreekStrings() {
        val buffer = ByteBuffer.wrap(greekBytes)

        val decoded = buffer.getCString(buffer.remaining())

        assertEquals("ναι, αυτό γράφτηκε με αυτόματη μετάφραση", decoded)
    }

    @Test
    fun testInvalidGreekStrings() {
        val buffer = ByteBuffer.wrap(greekBytes)
        // stop one byte short of the terminator so that the final character is cut off
        val truncatedLength = greekBytes.indexOf(0) - 1

        val truncated = buffer.getCString(truncatedLength)
        assertEquals("ναι, αυτό γράφτηκε με αυτόματη μετάφρασ�", truncated)

        // overwrite a continuation byte with an ASCII space and decode again from the start
        buffer.rewind()
        buffer.put(9, 32)

        val corrupted = buffer.getCString(truncatedLength)
        assertEquals("ναι, �Πυτό γράφτηκε με αυτόματη μετάφρασ�", corrupted)
    }
}