THRIFT-1023: Thrift encoding (UTF-8) issue with Ruby 1.9.2
Client: rb
Patch: Nathan Beyer
Fixes an encoding issue with UTF-8 strings in the Ruby client.
git-svn-id: https://svn.apache.org/repos/asf/thrift/trunk@1395832 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/lib/rb/spec/binary_protocol_spec_shared.rb b/lib/rb/spec/binary_protocol_spec_shared.rb
index ce4931f..c49ff1f 100644
--- a/lib/rb/spec/binary_protocol_spec_shared.rb
+++ b/lib/rb/spec/binary_protocol_spec_shared.rb
@@ -1,3 +1,4 @@
+# encoding: ascii-8bit
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
@@ -192,13 +193,41 @@
it "should error gracefully when trying to write a nil double" do
lambda { @prot.write_double(nil) }.should raise_error
end
-
- it "should write a string" do
- str = "hello world"
- @prot.write_string(str)
- @trans.read(@trans.available).should == [str.size].pack("N") + str
+
+ if RUBY_VERSION >= '1.9' # this branch additionally asserts on String#encoding
+ it 'should write a string' do
+ str = 'abc'
+ @prot.write_string(str)
+ a = @trans.read(@trans.available)
+ a.encoding.should == Encoding::BINARY # what reaches the transport must be tagged as raw bytes
+ a.unpack('C*').should == [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63] # i32 length 3, then 'a', 'b', 'c'
+ end
+
+ it 'should write a string with unicode characters' do
+ str = "abc \u20AC \u20AD".encode('UTF-8') # 11 UTF-8 bytes: "abc " (4) + U+20AC (3) + " " (1) + U+20AD (3)
+ @prot.write_string(str)
+ a = @trans.read(@trans.available)
+ a.encoding.should == Encoding::BINARY
+ a.unpack('C*').should == [0x00, 0x00, 0x00, 0x0B, 0x61, 0x62, 0x63, 0x20,
+ 0xE2, 0x82, 0xAC, 0x20, 0xE2, 0x82, 0xAD]
+ end
+
+ it 'should write a string with unicode characters and transcoding' do
+ str = "abc \u20AC".encode('ISO-8859-15') # input is deliberately NOT UTF-8
+ @prot.write_string(str)
+ a = @trans.read(@trans.available)
+ a.encoding.should == Encoding::BINARY
+ a.unpack('C*').should == [0x00, 0x00, 0x00, 0x07, 0x61, 0x62, 0x63, 0x20, 0xE2, 0x82, 0xAC] # euro sign arrives as its 3-byte UTF-8 form
+ end
+ else
+ it 'should write a string' do
+ str = 'abc'
+ @prot.write_string(str)
+ a = @trans.read(@trans.available)
+ a.unpack('C*').should == [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63] # i32 length 3, then 'a', 'b', 'c'
+ end
end
-
+
it "should error gracefully when trying to write a nil string" do
lambda { @prot.write_string(nil) }.should raise_error
end
@@ -294,11 +323,32 @@
@prot.read_double.should == f
end
end
-
- it "should read a string" do
- str = "hello world"
- @trans.write([str.size].pack("N") + str)
- @prot.read_string.should == str
+
+ if RUBY_VERSION >= '1.9' # this branch additionally asserts on String#encoding
+ it 'should read a string' do
+ # wire bytes: i32 length with value 3, followed by the three bytes 'a', 'b', 'c'
+ buffer = [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63].pack('C*')
+ @trans.write(buffer)
+ a = @prot.read_string
+ a.should == 'abc'.encode('UTF-8')
+ a.encoding.should == Encoding::UTF_8 # the string handed back must be tagged UTF-8
+ end
+
+ it 'should read a string containing unicode characters from UTF-8 encoded buffer' do
+ # wire bytes: i32 length with value 3 (bytes, not characters), followed by the single character U+20AC as three UTF-8 bytes
+ buffer = [0x00, 0x00, 0x00, 0x03, 0xE2, 0x82, 0xAC].pack('C*')
+ @trans.write(buffer)
+ a = @prot.read_string
+ a.should == "\u20AC".encode('UTF-8')
+ a.encoding.should == Encoding::UTF_8
+ end
+ else
+ it 'should read a string' do
+ # wire bytes: i32 length with value 3, followed by the three bytes 'a', 'b', 'c'
+ buffer = [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63].pack('C*')
+ @trans.write(buffer)
+ @prot.read_string.should == 'abc'
+ end
end
it "should perform a complete rpc with no args or return" do
diff --git a/lib/rb/spec/bytes_spec.rb b/lib/rb/spec/bytes_spec.rb
new file mode 100644
index 0000000..b82e304
--- /dev/null
+++ b/lib/rb/spec/bytes_spec.rb
@@ -0,0 +1,160 @@
+# encoding: UTF-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+require 'spec_helper'
+
+describe Thrift::Bytes do
+ if RUBY_VERSION >= '1.9' # these examples also assert on String#encoding
+ describe '.empty_byte_buffer' do
+ it 'should create an empty buffer' do
+ b = Thrift::Bytes.empty_byte_buffer
+ b.length.should == 0
+ b.encoding.should == Encoding::BINARY
+ end
+
+ it 'should create an empty buffer of given size' do
+ b = Thrift::Bytes.empty_byte_buffer 2
+ b.length.should == 2
+ b.getbyte(0).should == 0 # every byte of a sized buffer is expected to be zero
+ b.getbyte(1).should == 0
+ b.encoding.should == Encoding::BINARY
+ end
+ end
+
+ describe '.force_binary_encoding' do
+ it 'should change encoding' do
+ e = 'STRING'.encode('UTF-8')
+ e.encoding.should_not == Encoding::BINARY # sanity check: the input does not start out binary
+ a = Thrift::Bytes.force_binary_encoding e
+ a.encoding.should == Encoding::BINARY
+ end
+ end
+
+ describe '.get_string_byte' do
+ it 'should get the byte at index' do
+ s = "\x41\x42" # the two bytes 0x41, 0x42 ("AB")
+ Thrift::Bytes.get_string_byte(s, 0).should == 0x41
+ Thrift::Bytes.get_string_byte(s, 1).should == 0x42
+ end
+ end
+
+ describe '.set_string_byte' do
+ it 'should set byte value at index' do
+ s = "\x41\x42" # the two bytes 0x41, 0x42 ("AB")
+ Thrift::Bytes.set_string_byte(s, 0, 0x43)
+ s.getbyte(0).should == 0x43
+ s.should == 'CB' # the string is modified in place: first byte is now 0x43 ('C')
+ end
+ end
+
+ describe '.convert_to_utf8_byte_buffer' do
+ it 'should convert UTF-8 String to byte buffer' do
+ e = "\u20AC".encode('UTF-8') # a string with euro sign character U+20AC
+ e.length.should == 1 # one character before conversion ...
+
+ a = Thrift::Bytes.convert_to_utf8_byte_buffer e
+ a.encoding.should == Encoding::BINARY
+ a.length.should == 3 # ... three bytes after conversion
+ a.unpack('C*').should == [0xE2, 0x82, 0xAC]
+ end
+
+ it 'should convert ISO-8859-15 String to UTF-8 byte buffer' do
+ # Preconditions: confirm the shape of the ISO-8859-15 input before converting it
+ e = "\u20AC".encode('ISO-8859-15') # a string with euro sign character U+20AC, transcoded to ISO-8859-15
+ e.length.should == 1
+ e.unpack('C*').should == [0xA4] # same code point U+20AC, but ISO-8859-15 stores it as the single byte 0xA4
+
+ a = Thrift::Bytes.convert_to_utf8_byte_buffer e
+ a.encoding.should == Encoding::BINARY
+ a.length.should == 3 # transcoded to the three-byte UTF-8 form
+ a.unpack('C*').should == [0xE2, 0x82, 0xAC]
+ end
+ end
+
+ describe '.convert_to_string' do
+ it 'should convert UTF-8 byte buffer to a UTF-8 String' do
+ e = [0xE2, 0x82, 0xAC].pack("C*") # UTF-8 bytes of U+20AC
+ e.encoding.should == Encoding::BINARY # sanity check: the packed buffer starts out binary
+ a = Thrift::Bytes.convert_to_string e
+ a.encoding.should == Encoding::UTF_8
+ a.should == "\u20AC"
+ end
+ end
+
+ else # RUBY_VERSION < 1.9: same API, but no encoding assertions
+ describe '.empty_byte_buffer' do
+ it 'should create an empty buffer' do
+ b = Thrift::Bytes.empty_byte_buffer
+ b.length.should == 0
+ end
+
+ it 'should create an empty buffer of given size' do
+ b = Thrift::Bytes.empty_byte_buffer 2
+ b.length.should == 2
+ b[0].should == 0 # on this branch String#[] is compared against the integer byte value
+ b[1].should == 0
+ end
+ end
+
+ describe '.force_binary_encoding' do
+ it 'should be a no-op' do
+ e = 'STRING'
+ a = Thrift::Bytes.force_binary_encoding e
+ a.should == e
+ a.should be(e) # identity check: the very same object must come back
+ end
+ end
+
+ describe '.get_string_byte' do
+ it 'should get the byte at index' do
+ s = "\x41\x42" # the two bytes 0x41, 0x42 ("AB")
+ Thrift::Bytes.get_string_byte(s, 0).should == 0x41
+ Thrift::Bytes.get_string_byte(s, 1).should == 0x42
+ end
+ end
+
+ describe '.set_string_byte' do
+ it 'should set byte value at index' do
+ s = "\x41\x42" # the two bytes 0x41, 0x42 ("AB")
+ Thrift::Bytes.set_string_byte(s, 0, 0x43)
+ s[0].should == 0x43
+ s.should == 'CB' # the string is modified in place: first byte is now 0x43 ('C')
+ end
+ end
+
+ describe '.convert_to_utf8_byte_buffer' do
+ it 'should be a no-op' do
+ e = 'STRING'
+ a = Thrift::Bytes.convert_to_utf8_byte_buffer e
+ a.should == e
+ a.should be(e) # identity check: the very same object must come back
+ end
+ end
+
+ describe '.convert_to_string' do
+ it 'should be a no-op' do
+ e = 'STRING'
+ a = Thrift::Bytes.convert_to_string e
+ a.should == e
+ a.should be(e) # identity check: the very same object must come back
+ end
+ end
+ end
+end
diff --git a/lib/rb/spec/compact_protocol_spec.rb b/lib/rb/spec/compact_protocol_spec.rb
index 13c6b83..91dfe44 100644
--- a/lib/rb/spec/compact_protocol_spec.rb
+++ b/lib/rb/spec/compact_protocol_spec.rb
@@ -1,3 +1,4 @@
+# encoding: UTF-8
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
@@ -25,7 +26,7 @@
:i16 => (0..14).map {|shift| [1 << shift, -(1 << shift)]}.flatten.sort,
:i32 => (0..30).map {|shift| [1 << shift, -(1 << shift)]}.flatten.sort,
:i64 => (0..62).map {|shift| [1 << shift, -(1 << shift)]}.flatten.sort,
- :string => ["", "1", "short", "fourteen123456", "fifteen12345678", "1" * 127, "1" * 3000],
+ :string => ["", "1", "short", "fourteen123456", "fifteen12345678", "unicode characters: \u20AC \u20AD", "1" * 127, "1" * 3000],
:binary => ["", "\001", "\001" * 5, "\001" * 14, "\001" * 15, "\001" * 127, "\001" * 3000],
:double => [0.0, 1.0, -1.0, 1.1, -1.1, 10000000.1, 1.0/0.0, -1.0/0.0],
:bool => [true, false]
diff --git a/lib/rb/spec/json_protocol_spec.rb b/lib/rb/spec/json_protocol_spec.rb
index 3945925..a294ac5 100644
--- a/lib/rb/spec/json_protocol_spec.rb
+++ b/lib/rb/spec/json_protocol_spec.rb
@@ -1,3 +1,4 @@
+# encoding: UTF-8
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
@@ -220,9 +221,25 @@
@trans.read(@trans.available).should == "\"-Infinity\""
end
- it "should write string" do
- @prot.write_string("this is a test string")
- @trans.read(@trans.available).should == "\"this is a test string\""
+ if RUBY_VERSION >= '1.9' # this branch additionally asserts on String#encoding
+ it 'should write string' do
+ @prot.write_string('this is a test string')
+ a = @trans.read(@trans.available)
+ a.should == '"this is a test string"'.force_encoding(Encoding::BINARY) # expected JSON text, retagged as raw bytes for the comparison
+ a.encoding.should == Encoding::BINARY
+ end
+
+ it 'should write string with unicode characters' do
+ @prot.write_string("this is a test string with unicode characters: \u20AC \u20AD") # double-quoted: real U+20AC / U+20AD characters
+ a = @trans.read(@trans.available)
+ a.should == "\"this is a test string with unicode characters: \u20AC \u20AD\"".force_encoding(Encoding::BINARY)
+ a.encoding.should == Encoding::BINARY # expected output carries the characters as raw bytes, not as \u escapes
+ end
+ else
+ it 'should write string' do
+ @prot.write_string('this is a test string')
+ @trans.read(@trans.available).should == '"this is a test string"'
+ end
end
it "should write binary" do
@@ -461,9 +478,25 @@
@prot.read_double.should == 12.23
end
- it "should read string" do
- @trans.write("\"this is a test string\"")
- @prot.read_string.should == "this is a test string"
+ if RUBY_VERSION >= '1.9' # this branch additionally asserts on String#encoding
+ it 'should read string' do
+ @trans.write('"this is a test string"'.force_encoding(Encoding::BINARY)) # feed the transport raw bytes
+ a = @prot.read_string
+ a.should == 'this is a test string'
+ a.encoding.should == Encoding::UTF_8 # the string handed back must be tagged UTF-8
+ end
+
+ it 'should read string with unicode characters' do
+ @trans.write('"this is a test string with unicode characters: \u20AC \u20AD"'.force_encoding(Encoding::BINARY)) # single-quoted: \u20AC stays a literal escape sequence in the input
+ a = @prot.read_string
+ a.should == "this is a test string with unicode characters: \u20AC \u20AD" # double-quoted: the actual characters, so the escapes must have been decoded
+ a.encoding.should == Encoding::UTF_8
+ end
+ else
+ it 'should read string' do
+ @trans.write('"this is a test string"')
+ @prot.read_string.should == 'this is a test string'
+ end
end
it "should read binary" do