Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 89 additions & 66 deletions java/core/src/main/java/com/google/protobuf/Utf8.java
Original file line number Diff line number Diff line change
Expand Up @@ -188,70 +188,13 @@ private static class UnpairedSurrogateException extends Exception {
}

/**
* Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, this
* method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in both
* time and space.
* Returns the number of bytes in the UTF-8-encoded form of {@code sequence}.
*
* <p>This method is will always return the same value as {@code string.getBytes(UTF_8).length},
* but it may be implemented more efficiently for the current platform.
*/
static int encodedLength(String string) {
// Warning to maintainers: this implementation is highly optimized.
int utf16Length = string.length();
int utf8Length = utf16Length;
int i = 0;

// This loop optimizes for pure ASCII.
while (i < utf16Length && string.charAt(i) < 0x80) {
i++;
}

// This loop optimizes for chars less than 0x800.
for (; i < utf16Length; i++) {
char c = string.charAt(i);
if (c < 0x800) {
utf8Length += ((0x7f - c) >>> 31); // branch free!
} else {
try {
utf8Length += encodedLengthGeneral(string, i);
} catch (UnpairedSurrogateException e) {
// Our hand rolled loops don't handle unpaired surrogates here. This should be
// exceptionally rare, so we fallback to the naive implementation to find out the
// length that the Java internal implementation will return for this string after
// replacement characters.
return string.getBytes(Internal.UTF_8).length;
}
break;
}
}

if (utf8Length < utf16Length) {
// Necessary and sufficient condition for overflow because of maximum 3x expansion
throw new IllegalArgumentException(
"UTF-8 length does not fit in int: " + (utf8Length + (1L << 32)));
}
return utf8Length;
}

private static int encodedLengthGeneral(String string, int start)
throws UnpairedSurrogateException {
int utf16Length = string.length();
int utf8Length = 0;
for (int i = start; i < utf16Length; i++) {
char c = string.charAt(i);
if (c < 0x800) {
utf8Length += (0x7f - c) >>> 31; // branch free!
} else {
utf8Length += 2;
// jdk7+: if (Character.isSurrogate(c)) {
if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) {
// Check that we have a well-formed surrogate pair.
int cp = Character.codePointAt(string, i);
if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
throw new UnpairedSurrogateException(i, utf16Length);
}
i++;
}
}
}
return utf8Length;
return processor.encodedLength(string);
}

static int encode(String in, byte[] out, int offset, int length) {
Expand Down Expand Up @@ -718,6 +661,15 @@ protected void encodeUtf8Naive(String in, ByteBuffer out) {
*/
abstract int encodeUtf8(String in, byte[] out, int offset, int length);

/**
* Returns the number of bytes required to encode the given input character sequence ({@code
* in}) to UTF-8.
*
* @param in the input character sequence to be encoded
* @return the number of bytes required to encode the input character sequence
*/
abstract int encodedLength(String in);

/**
* Encodes an input character sequence ({@code in}) to UTF-8 in the target buffer ({@code out}).
* Upon returning from this method, the {@code out} position will point to the position after
Expand Down Expand Up @@ -978,6 +930,69 @@ int encodeUtf8(String in, byte[] out, int offset, int length) {
return j;
}

@Override
int encodedLength(String string) {
// Warning to maintainers: this implementation is highly optimized.
int utf16Length = string.length();
int utf8Length = utf16Length;
int i = 0;

// This loop optimizes for pure ASCII.
while (i < utf16Length && string.charAt(i) < 0x80) {
i++;
}

// This loop optimizes for chars less than 0x800.
for (; i < utf16Length; i++) {
char c = string.charAt(i);
if (c < 0x800) {
utf8Length += ((0x7f - c) >>> 31); // branch free!
} else {
try {
utf8Length += encodedLengthGeneral(string, i);
} catch (UnpairedSurrogateException e) {
// Our hand rolled loops don't handle unpaired surrogates here. This should be
// exceptionally rare, so we fallback to the naive implementation to find out the
// length that the Java internal implementation will return for this string after
// replacement characters.
return string.getBytes(Internal.UTF_8).length;
}
break;
}
}

if (utf8Length < utf16Length) {
// Necessary and sufficient condition for overflow because of maximum 3x expansion
throw new IllegalArgumentException(
"UTF-8 length does not fit in int: " + (utf8Length + (1L << 32)));
}
return utf8Length;
}

private static int encodedLengthGeneral(String string, int start)
throws UnpairedSurrogateException {
int utf16Length = string.length();
int utf8Length = 0;
for (int i = start; i < utf16Length; i++) {
char c = string.charAt(i);
if (c < 0x800) {
utf8Length += (0x7f - c) >>> 31; // branch free!
} else {
utf8Length += 2;
// jdk7+: if (Character.isSurrogate(c)) {
if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) {
// Check that we have a well-formed surrogate pair.
int cp = Character.codePointAt(string, i);
if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
throw new UnpairedSurrogateException(i, utf16Length);
}
i++;
}
}
}
return utf8Length;
}

@Override
protected void encodeUtf8Internal(String in, ByteBuffer out) {
final int inLength = in.length();
Expand Down Expand Up @@ -1422,15 +1437,23 @@ String decodeUtf8Direct(ByteBuffer buffer, int index, int size)

@Override
int encodeUtf8(String in, byte[] out, int offset, int length) {
// On our servers with strings that have optimized internals, empirically doing the naive
// thing is faster than our best fancy loops.
// On our servers with strings that have optimized internals, doing the naive
// thing has been empirically measured as faster than our best fancy loops (as of 2025).
return encodeUtf8Naive(in, out, offset, length);
}

@Override
int encodedLength(final String in) {
// On our servers with strings that have optimized internals, doing the naive
// thing has been empirically measured as faster than our best fancy loops (as of 2025).
final byte[] bytes = in.getBytes(Internal.UTF_8);
return bytes.length;
}

@Override
protected void encodeUtf8Internal(String in, ByteBuffer out) {
// On our servers with strings that have optimized internals, empirically doing the naive
// thing is faster than our best fancy loops.
// On our servers with strings that have optimized internals, doing the naive
// thing has been empirically measured as faster than our best fancy loops (as of 2025).
encodeUtf8Naive(in, out);
}

Expand Down
Loading