THRIFT-1023: Thrift encoding (UTF-8) issue with Ruby 1.9.2
Client: rb
Patch: Nathan Beyer 

Fixes an encoding issue for UTF-8 strings in the Ruby client.



git-svn-id: https://svn.apache.org/repos/asf/thrift/trunk@1395832 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/lib/rb/spec/binary_protocol_spec_shared.rb b/lib/rb/spec/binary_protocol_spec_shared.rb
index ce4931f..c49ff1f 100644
--- a/lib/rb/spec/binary_protocol_spec_shared.rb
+++ b/lib/rb/spec/binary_protocol_spec_shared.rb
@@ -1,3 +1,4 @@
+# encoding: ascii-8bit
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements. See the NOTICE file
@@ -192,13 +193,41 @@
   it "should error gracefully when trying to write a nil double" do
     lambda { @prot.write_double(nil) }.should raise_error
   end
-  
-  it "should write a string" do
-    str = "hello world"
-    @prot.write_string(str)
-    @trans.read(@trans.available).should == [str.size].pack("N") + str
+
+  if RUBY_VERSION >= '1.9'
+    it 'should write a string' do
+      str = 'abc'
+      @prot.write_string(str)
+      a = @trans.read(@trans.available)
+      a.encoding.should == Encoding::BINARY
+      a.unpack('C*').should == [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63]
+    end
+
+    it 'should write a string with unicode characters' do
+      str = "abc \u20AC \u20AD".encode('UTF-8')
+      @prot.write_string(str)
+      a = @trans.read(@trans.available)
+      a.encoding.should == Encoding::BINARY
+      a.unpack('C*').should == [0x00, 0x00, 0x00, 0x0B, 0x61, 0x62, 0x63, 0x20,
+                                0xE2, 0x82, 0xAC, 0x20, 0xE2, 0x82, 0xAD]
+    end
+
+    it 'should write a string with unicode characters and transcoding' do
+      str = "abc \u20AC".encode('ISO-8859-15')
+      @prot.write_string(str)
+      a = @trans.read(@trans.available)
+      a.encoding.should == Encoding::BINARY
+      a.unpack('C*').should == [0x00, 0x00, 0x00, 0x07, 0x61, 0x62, 0x63, 0x20, 0xE2, 0x82, 0xAC]
+    end
+  else
+    it 'should write a string' do
+      str = 'abc'
+      @prot.write_string(str)
+      a = @trans.read(@trans.available)
+      a.unpack('C*').should == [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63]
+    end
   end
-  
+
   it "should error gracefully when trying to write a nil string" do
     lambda { @prot.write_string(nil) }.should raise_error
   end
@@ -294,11 +323,32 @@
       @prot.read_double.should == f
     end
   end
-  
-  it "should read a string" do
-    str = "hello world"
-    @trans.write([str.size].pack("N") + str)
-    @prot.read_string.should == str
+
+  if RUBY_VERSION >= '1.9'
+    it 'should read a string' do
+      # i32 of value 3, followed by three characters/UTF-8 bytes 'a', 'b', 'c'
+      buffer = [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63].pack('C*')
+      @trans.write(buffer)
+      a = @prot.read_string
+      a.should == 'abc'.encode('UTF-8')
+      a.encoding.should == Encoding::UTF_8
+    end
+
+    it 'should read a string containing unicode characters from UTF-8 encoded buffer' do
+      # i32 of value 3, followed by one character U+20AC made up of three bytes
+      buffer = [0x00, 0x00, 0x00, 0x03, 0xE2, 0x82, 0xAC].pack('C*')
+      @trans.write(buffer)
+      a = @prot.read_string
+      a.should == "\u20AC".encode('UTF-8')
+      a.encoding.should == Encoding::UTF_8
+    end
+  else
+    it 'should read a string' do
+      # i32 of value 3, followed by three characters/UTF-8 bytes 'a', 'b', 'c'
+      buffer = [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63].pack('C*')
+      @trans.write(buffer)
+      @prot.read_string.should == 'abc'
+    end
   end
 
   it "should perform a complete rpc with no args or return" do
diff --git a/lib/rb/spec/bytes_spec.rb b/lib/rb/spec/bytes_spec.rb
new file mode 100644
index 0000000..b82e304
--- /dev/null
+++ b/lib/rb/spec/bytes_spec.rb
@@ -0,0 +1,160 @@
+# encoding: UTF-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+require 'spec_helper'
+
+describe Thrift::Bytes do
+  if RUBY_VERSION >= '1.9'
+    describe '.empty_byte_buffer' do
+      it 'should create an empty buffer' do
+        b = Thrift::Bytes.empty_byte_buffer
+        b.length.should == 0
+        b.encoding.should == Encoding::BINARY
+      end
+
+      it 'should create an empty buffer of given size' do
+        b = Thrift::Bytes.empty_byte_buffer 2
+        b.length.should == 2
+        b.getbyte(0).should == 0
+        b.getbyte(1).should == 0
+        b.encoding.should == Encoding::BINARY
+      end
+    end
+
+    describe '.force_binary_encoding' do
+      it 'should change encoding' do
+        e = 'STRING'.encode('UTF-8')
+        e.encoding.should_not == Encoding::BINARY
+        a = Thrift::Bytes.force_binary_encoding e
+        a.encoding.should == Encoding::BINARY
+      end
+    end
+
+    describe '.get_string_byte' do
+      it 'should get the byte at index' do
+        s = "\x41\x42"
+        Thrift::Bytes.get_string_byte(s, 0).should == 0x41
+        Thrift::Bytes.get_string_byte(s, 1).should == 0x42
+      end
+    end
+
+    describe '.set_string_byte' do
+      it 'should set byte value at index' do
+        s = "\x41\x42"
+        Thrift::Bytes.set_string_byte(s, 0, 0x43)
+        s.getbyte(0).should == 0x43
+        s.should == 'CB'
+      end
+    end
+
+    describe '.convert_to_utf8_byte_buffer' do
+      it 'should convert UTF-8 String to byte buffer' do
+        e = "\u20AC".encode('UTF-8') # a string with euro sign character U+20AC
+        e.length.should == 1
+
+        a = Thrift::Bytes.convert_to_utf8_byte_buffer e
+        a.encoding.should == Encoding::BINARY
+        a.length.should == 3
+        a.unpack('C*').should == [0xE2, 0x82, 0xAC]
+      end
+
+      it 'should convert ISO-8859-15 String to UTF-8 byte buffer' do
+        # Assumptions
+        e = "\u20AC".encode('ISO-8859-15') # a string with euro sign character U+20AC, then converted to ISO-8859-15
+        e.length.should == 1
+        e.unpack('C*').should == [0xA4] # euro sign is encoded as the single byte 0xA4 in ISO-8859-15
+
+        a = Thrift::Bytes.convert_to_utf8_byte_buffer e
+        a.encoding.should == Encoding::BINARY
+        a.length.should == 3
+        a.unpack('C*').should == [0xE2, 0x82, 0xAC]
+      end
+    end
+
+    describe '.convert_to_string' do
+      it 'should convert UTF-8 byte buffer to a UTF-8 String' do
+        e = [0xE2, 0x82, 0xAC].pack("C*")
+        e.encoding.should == Encoding::BINARY
+        a = Thrift::Bytes.convert_to_string e
+        a.encoding.should == Encoding::UTF_8
+        a.should == "\u20AC"
+      end
+    end
+
+  else # RUBY_VERSION
+    describe '.empty_byte_buffer' do
+      it 'should create an empty buffer' do
+        b = Thrift::Bytes.empty_byte_buffer
+        b.length.should == 0
+      end
+
+      it 'should create an empty buffer of given size' do
+        b = Thrift::Bytes.empty_byte_buffer 2
+        b.length.should == 2
+        b[0].should == 0
+        b[1].should == 0
+      end
+    end
+
+    describe '.force_binary_encoding' do
+      it 'should be a no-op' do
+        e = 'STRING'
+        a = Thrift::Bytes.force_binary_encoding e
+        a.should == e
+        a.should be(e)
+      end
+    end
+
+    describe '.get_string_byte' do
+      it 'should get the byte at index' do
+        s = "\x41\x42"
+        Thrift::Bytes.get_string_byte(s, 0).should == 0x41
+        Thrift::Bytes.get_string_byte(s, 1).should == 0x42
+      end
+    end
+
+    describe '.set_string_byte' do
+      it 'should set byte value at index' do
+        s = "\x41\x42"
+        Thrift::Bytes.set_string_byte(s, 0, 0x43)
+        s[0].should == 0x43
+        s.should == 'CB'
+      end
+    end
+
+    describe '.convert_to_utf8_byte_buffer' do
+      it 'should be a no-op' do
+        e = 'STRING'
+        a = Thrift::Bytes.convert_to_utf8_byte_buffer e
+        a.should == e
+        a.should be(e)
+      end
+    end
+
+    describe '.convert_to_string' do
+      it 'should be a no-op' do
+        e = 'STRING'
+        a = Thrift::Bytes.convert_to_string e
+        a.should == e
+        a.should be(e)
+      end
+    end
+  end
+end
diff --git a/lib/rb/spec/compact_protocol_spec.rb b/lib/rb/spec/compact_protocol_spec.rb
index 13c6b83..91dfe44 100644
--- a/lib/rb/spec/compact_protocol_spec.rb
+++ b/lib/rb/spec/compact_protocol_spec.rb
@@ -1,3 +1,4 @@
+# encoding: UTF-8
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements. See the NOTICE file
@@ -25,7 +26,7 @@
     :i16 => (0..14).map {|shift| [1 << shift, -(1 << shift)]}.flatten.sort,
     :i32 => (0..30).map {|shift| [1 << shift, -(1 << shift)]}.flatten.sort,
     :i64 => (0..62).map {|shift| [1 << shift, -(1 << shift)]}.flatten.sort,
-    :string => ["", "1", "short", "fourteen123456", "fifteen12345678", "1" * 127, "1" * 3000],
+    :string => ["", "1", "short", "fourteen123456", "fifteen12345678", "unicode characters: \u20AC \u20AD", "1" * 127, "1" * 3000],
     :binary => ["", "\001", "\001" * 5, "\001" * 14, "\001" * 15, "\001" * 127, "\001" * 3000],
     :double => [0.0, 1.0, -1.0, 1.1, -1.1, 10000000.1, 1.0/0.0, -1.0/0.0],
     :bool => [true, false]
diff --git a/lib/rb/spec/json_protocol_spec.rb b/lib/rb/spec/json_protocol_spec.rb
index 3945925..a294ac5 100644
--- a/lib/rb/spec/json_protocol_spec.rb
+++ b/lib/rb/spec/json_protocol_spec.rb
@@ -1,3 +1,4 @@
+# encoding: UTF-8
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements. See the NOTICE file
@@ -220,9 +221,25 @@
       @trans.read(@trans.available).should == "\"-Infinity\""
     end
 
-    it "should write string" do
-      @prot.write_string("this is a test string")
-      @trans.read(@trans.available).should == "\"this is a test string\""
+    if RUBY_VERSION >= '1.9'
+      it 'should write string' do
+        @prot.write_string('this is a test string')
+        a = @trans.read(@trans.available)
+        a.should == '"this is a test string"'.force_encoding(Encoding::BINARY)
+        a.encoding.should == Encoding::BINARY
+      end
+
+      it 'should write string with unicode characters' do
+        @prot.write_string("this is a test string with unicode characters: \u20AC \u20AD")
+        a = @trans.read(@trans.available)
+        a.should == "\"this is a test string with unicode characters: \u20AC \u20AD\"".force_encoding(Encoding::BINARY)
+        a.encoding.should == Encoding::BINARY
+      end
+    else
+      it 'should write string' do
+        @prot.write_string('this is a test string')
+        @trans.read(@trans.available).should == '"this is a test string"'
+      end
     end
 
     it "should write binary" do
@@ -461,9 +478,25 @@
       @prot.read_double.should == 12.23
     end
 
-    it "should read string" do
-      @trans.write("\"this is a test string\"")
-      @prot.read_string.should == "this is a test string"
+    if RUBY_VERSION >= '1.9'
+      it 'should read string' do
+        @trans.write('"this is a test string"'.force_encoding(Encoding::BINARY))
+        a = @prot.read_string
+        a.should == 'this is a test string'
+        a.encoding.should == Encoding::UTF_8
+      end
+
+      it 'should read string with unicode characters' do
+        @trans.write('"this is a test string with unicode characters: \u20AC \u20AD"'.force_encoding(Encoding::BINARY))
+        a = @prot.read_string
+        a.should == "this is a test string with unicode characters: \u20AC \u20AD"
+        a.encoding.should == Encoding::UTF_8
+      end
+    else
+      it 'should read string' do
+        @trans.write('"this is a test string"')
+        @prot.read_string.should == 'this is a test string'
+      end
     end
 
     it "should read binary" do