/**
 * Hand-written UTF-8 encoder and decoder for Java strings.
 *
 * <p>Encoding produces standard UTF-8: a valid surrogate pair becomes one
 * 4-byte sequence, and an unpaired surrogate (which has no UTF-8 form) is
 * written as a single {@code '?'} byte, the same substitution
 * {@code String.getBytes("UTF-8")} makes.
 *
 * <p>Decoding never reads outside the window it is given. Every malformed
 * sequence (stray continuation byte, invalid lead byte, truncated or overlong
 * sequence, code point above U+10FFFF) is replaced by one U+FFFD character.
 * A 3-byte sequence that decodes to a surrogate code unit is passed through
 * unchanged, so byte streams in which each surrogate was encoded on its own
 * still round-trip.
 *
 * <p>All methods throw {@link NullPointerException} when handed a
 * {@code null} string or array.
 */
public final class Utf8Helper {
  /** Substituted for each malformed byte sequence met while decoding. */
  private static final char REPLACEMENT_CHAR = '\uFFFD';

  /** Written in place of a surrogate that is not part of a valid pair. */
  private static final byte UNPAIRED_SURROGATE_BYTE = (byte) '?';

  private Utf8Helper() {}

  /**
   * Computes how many bytes {@link #encode(String)} produces for a string.
   *
   * @param s the string to measure
   * @return the length in bytes of the UTF-8 form of {@code s}
   */
  public static int getByteLength(final String s) {
    final int length = s.length();
    int byteLength = 0;
    for (int i = 0; i < length; i++) {
      final char c = s.charAt(i);
      if (c <= 0x007F) {
        byteLength++;
      } else if (c <= 0x07FF) {
        byteLength += 2;
      } else if (!isSurrogate(c)) {
        byteLength += 3;
      } else if (startsSurrogatePair(s, i)) {
        byteLength += 4;
        i++; // the low surrogate is accounted for by the same 4 bytes
      } else {
        byteLength++; // unpaired surrogate becomes a single '?'
      }
    }
    return byteLength;
  }

  /**
   * Encodes a string into a freshly allocated, exactly sized byte array.
   *
   * @param s the string to encode
   * @return the UTF-8 bytes of {@code s}
   */
  public static byte[] encode(String s) {
    byte[] buf = new byte[getByteLength(s)];
    encode(s, buf, 0);
    return buf;
  }

  /**
   * Encodes a string as UTF-8 into an existing array.
   *
   * @param s the string to encode
   * @param buf destination array; must have at least
   *     {@code getByteLength(s)} bytes available from {@code offset}
   * @param offset index in {@code buf} of the first byte to write
   * @throws ArrayIndexOutOfBoundsException if {@code buf} is too small
   */
  public static void encode(String s, byte[] buf, int offset) {
    final int length = s.length();
    int pos = offset;
    for (int i = 0; i < length; i++) {
      final char c = s.charAt(i);
      if (c <= 0x007F) {
        buf[pos++] = (byte) c;
      } else if (c <= 0x07FF) {
        buf[pos++] = (byte) (0xC0 | (c >> 6));
        buf[pos++] = (byte) (0x80 | (c & 0x3F));
      } else if (!isSurrogate(c)) {
        buf[pos++] = (byte) (0xE0 | (c >> 12));
        buf[pos++] = (byte) (0x80 | ((c >> 6) & 0x3F));
        buf[pos++] = (byte) (0x80 | (c & 0x3F));
      } else if (startsSurrogatePair(s, i)) {
        // Combine both halves into the real code point and skip the low half.
        final int codePoint = Character.toCodePoint(c, s.charAt(++i));
        buf[pos++] = (byte) (0xF0 | (codePoint >> 18));
        buf[pos++] = (byte) (0x80 | ((codePoint >> 12) & 0x3F));
        buf[pos++] = (byte) (0x80 | ((codePoint >> 6) & 0x3F));
        buf[pos++] = (byte) (0x80 | (codePoint & 0x3F));
      } else {
        buf[pos++] = UNPAIRED_SURROGATE_BYTE;
      }
    }
  }

  /**
   * Decodes an entire byte array as UTF-8.
   *
   * @param buf the bytes to decode
   * @return the decoded string
   */
  public static String decode(byte[] buf) {
    return decode(buf, 0, buf.length);
  }

  /**
   * Decodes {@code byteLength} bytes of UTF-8 starting at {@code offset}.
   * Bytes outside that window are never examined.
   *
   * @param buf array holding the encoded text
   * @param offset index of the first byte to decode
   * @param byteLength number of bytes to decode
   * @return the decoded string, with U+FFFD standing in for malformed input
   * @throws ArrayIndexOutOfBoundsException if the window extends past
   *     {@code buf}
   * @throws NegativeArraySizeException if {@code byteLength} is negative
   */
  public static String decode(byte[] buf, int offset, int byteLength) {
    final int end = offset + byteLength;
    // Every emitted char consumes at least one byte, and the two chars of a
    // surrogate pair consume four, so byteLength chars is always enough.
    final char[] chars = new char[byteLength];
    int charCount = 0;
    int byteIndex = offset;

    while (byteIndex < end) {
      final int lead = buf[byteIndex++] & 0xFF;
      if (lead < 0x80) {
        chars[charCount++] = (char) lead;
        continue;
      }

      int needed;     // continuation bytes the lead byte announces
      int codePoint;  // payload bits gathered so far
      int minimum;    // smallest value that legitimately needs this length
      if ((lead & 0xE0) == 0xC0) {
        needed = 1;
        codePoint = lead & 0x1F;
        minimum = 0x80;
      } else if ((lead & 0xF0) == 0xE0) {
        needed = 2;
        codePoint = lead & 0x0F;
        minimum = 0x800;
      } else if ((lead & 0xF8) == 0xF0) {
        needed = 3;
        codePoint = lead & 0x07;
        minimum = 0x10000;
      } else {
        // Stray continuation byte (10xxxxxx) or an invalid lead (11111xxx).
        chars[charCount++] = REPLACEMENT_CHAR;
        continue;
      }

      int taken = 0;
      while (taken < needed
          && byteIndex < end
          && (buf[byteIndex] & 0xC0) == 0x80) {
        codePoint = (codePoint << 6) | (buf[byteIndex] & 0x3F);
        byteIndex++;
        taken++;
      }

      if (taken < needed
          || codePoint < minimum
          || codePoint > Character.MAX_CODE_POINT) {
        // Truncated, overlong or out-of-range: one replacement for the lot.
        chars[charCount++] = REPLACEMENT_CHAR;
      } else {
        // Writes one char for BMP values and a surrogate pair otherwise.
        charCount += Character.toChars(codePoint, chars, charCount);
      }
    }
    return new String(chars, 0, charCount);
  }

  /** Tells whether {@code c} is a high or low surrogate code unit. */
  private static boolean isSurrogate(char c) {
    return c >= Character.MIN_SURROGATE && c <= Character.MAX_SURROGATE;
  }

  /** Tells whether a high surrogate at {@code index} is followed by a low one. */
  private static boolean startsSurrogatePair(String s, int index) {
    return Character.isHighSurrogate(s.charAt(index))
        && index + 1 < s.length()
        && Character.isLowSurrogate(s.charAt(index + 1));
  }
}
throws TException { @@ -323,27 +318,19 @@ public class TBinaryProtocol extends TProtocol { int size = readI32(); if (trans_.getBytesRemainingInBuffer() >= size) { - try { - String s = new String(trans_.getBuffer(), trans_.getBufferPosition(), size, "UTF-8"); - trans_.consumeBuffer(size); - return s; - } catch (UnsupportedEncodingException e) { - throw new TException("JVM DOES NOT SUPPORT UTF-8"); - } + String s = Utf8Helper.decode(trans_.getBuffer(), trans_.getBufferPosition(), size); + trans_.consumeBuffer(size); + return s; } return readStringBody(size); } public String readStringBody(int size) throws TException { - try { - checkReadLength(size); - byte[] buf = new byte[size]; - trans_.readAll(buf, 0, size); - return new String(buf, "UTF-8"); - } catch (UnsupportedEncodingException uex) { - throw new TException("JVM DOES NOT SUPPORT UTF-8"); - } + checkReadLength(size); + byte[] buf = new byte[size]; + trans_.readAll(buf, 0, size); + return Utf8Helper.decode(buf); } public byte[] readBinary() throws TException { diff --git a/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java b/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java index f4979423..f50ef1b0 100755 --- a/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java +++ b/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java @@ -20,10 +20,9 @@ package org.apache.thrift.protocol; -import java.io.UnsupportedEncodingException; - import org.apache.thrift.ShortStack; import org.apache.thrift.TException; +import org.apache.thrift.Utf8Helper; import org.apache.thrift.transport.TTransport; /** @@ -293,11 +292,7 @@ public final class TCompactProtocol extends TProtocol { * Write a string to the wire with a varint size preceeding. 
*/ public void writeString(String str) throws TException { - try { - writeBinary(str.getBytes("UTF-8")); - } catch (UnsupportedEncodingException e) { - throw new TException("UTF-8 not supported!"); - } + writeBinary(Utf8Helper.encode(str)); } /** @@ -610,16 +605,12 @@ public final class TCompactProtocol extends TProtocol { return ""; } - try { - if (trans_.getBytesRemainingInBuffer() >= length) { - String str = new String(trans_.getBuffer(), trans_.getBufferPosition(), length, "UTF-8"); - trans_.consumeBuffer(length); - return str; - } else { - return new String(readBinary(length), "UTF-8"); - } - } catch (UnsupportedEncodingException e) { - throw new TException("UTF-8 not supported!"); + if (trans_.getBytesRemainingInBuffer() >= length) { + String str = Utf8Helper.decode(trans_.getBuffer(), trans_.getBufferPosition(), length); + trans_.consumeBuffer(length); + return str; + } else { + return Utf8Helper.decode(readBinary(length)); } } diff --git a/lib/java/test/org/apache/thrift/TestUtf8Helper.java b/lib/java/test/org/apache/thrift/TestUtf8Helper.java new file mode 100644 index 00000000..9d04d5af --- /dev/null +++ b/lib/java/test/org/apache/thrift/TestUtf8Helper.java @@ -0,0 +1,58 @@ +package org.apache.thrift; + +import java.io.UnsupportedEncodingException; +import java.util.Arrays; + +import junit.framework.TestCase; + +public class TestUtf8Helper extends TestCase { + private static final String NON_UNICODE_STRING = "here's some text"; + + private static final byte[] kUnicodeBytes = { + (byte)0xd3, (byte)0x80, (byte)0xe2, (byte)0x85, (byte)0xae, (byte)0xce, + (byte)0x9d, (byte)0x20, (byte)0xd0, (byte)0x9d, (byte)0xce, (byte)0xbf, + (byte)0xe2, (byte)0x85, (byte)0xbf, (byte)0xd0, (byte)0xbe, (byte)0xc9, + (byte)0xa1, (byte)0xd0, (byte)0xb3, (byte)0xd0, (byte)0xb0, (byte)0xcf, + (byte)0x81, (byte)0xe2, (byte)0x84, (byte)0x8e, (byte)0x20, (byte)0xce, + (byte)0x91, (byte)0x74, (byte)0x74, (byte)0xce, (byte)0xb1, (byte)0xe2, + (byte)0x85, (byte)0xbd, (byte)0xce, 
(byte)0xba, (byte)0x83, (byte)0xe2, + (byte)0x80, (byte)0xbc + }; + + private static final String UNICODE_STRING = "abc\u5639\u563b"; + private static final byte[] UNICODE_STRING_BYTES; + + private static final String UNICODE_STRING_2; + private static final byte[] UNICODE_STRING_BYTES_2; + + static { + try { + UNICODE_STRING_BYTES = UNICODE_STRING.getBytes("UTF-8"); + UNICODE_STRING_2 = new String(kUnicodeBytes, "UTF-8"); + UNICODE_STRING_BYTES_2 = UNICODE_STRING_2.getBytes("UTF-8"); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + } + + + public void testEncode() throws Exception { + byte[] bytes = NON_UNICODE_STRING.getBytes("UTF-8"); + byte[] otherBytes = Utf8Helper.encode(NON_UNICODE_STRING); + assertTrue(Arrays.equals(bytes, otherBytes)); + + otherBytes = Utf8Helper.encode(UNICODE_STRING); + assertTrue(Arrays.equals(UNICODE_STRING_BYTES, otherBytes)); + + otherBytes = Utf8Helper.encode(UNICODE_STRING_2); + assertTrue(Arrays.equals(UNICODE_STRING_BYTES_2, otherBytes)); + } + + public void testDecode() throws Exception { + byte[] bytes = NON_UNICODE_STRING.getBytes("UTF-8"); + assertEquals(NON_UNICODE_STRING, Utf8Helper.decode(bytes)); + + assertEquals(UNICODE_STRING, Utf8Helper.decode(UNICODE_STRING_BYTES)); + assertEquals(UNICODE_STRING_2, Utf8Helper.decode(UNICODE_STRING_BYTES_2)); + } +} -- 2.17.1