Thrift: Clean up and test TDenseProtocol

Summary:
- TDenseProtocol now includes a part of the struct fingerprint in
  the serialized message, to protect from unserialzing trash.
- A lot of cleanups and commenting for TDenseProtocol.
- A lot of test cases for same.

Reviewed By: mcslee

Test Plan: test/DenseProtoTest.cpp

Revert Plan: ok


git-svn-id: https://svn.apache.org/repos/asf/incubator/thrift/trunk@665257 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/lib/cpp/src/protocol/TDenseProtocol.cpp b/lib/cpp/src/protocol/TDenseProtocol.cpp
index 99117d6..e79d4f1 100644
--- a/lib/cpp/src/protocol/TDenseProtocol.cpp
+++ b/lib/cpp/src/protocol/TDenseProtocol.cpp
@@ -4,20 +4,95 @@
 // See accompanying file LICENSE or visit the Thrift site at:
 // http://developers.facebook.com/thrift/
 
+/*
+
+IMPLEMENTATION DETAILS
+
+TDenseProtocol was designed to have a smaller serialized form than
+TBinaryProtocol.  This is accomplished using two techniques.  The first is
+variable-length integer encoding.  We use the same technique that the Standard
+MIDI File format uses for "variable-length quantities"
+(http://en.wikipedia.org/wiki/Variable-length_quantity).
+All integers (including i16, but not byte) are first cast to uint64_t,
+then written out as variable-length quantities.  This has the unfortunate side
+effect that all negative numbers require 10 bytes, but negative numbers tend
+to be far less common than positive ones.
+
+The second technique eliminating the field ids used by TBinaryProtocol.  This
+decision required support from the Thrift compiler and also sacrifices some of
+the backward and forward compatibility of TBinaryProtocol.
+
+We considered implementing this technique by generating separate readers and
+writers for the dense protocol (this is how Pillar, Thrift's predecessor,
+worked), but this idea had a few problems:
+- Our abstractions go out the window.
+- We would have to maintain a second code generator.
+- Preserving compatibility with old versions of the structures would be a
+  nightmare.
+
+Therefore, we chose an alternate implementation that stored the description of
+the data neither in the data itself (like TBinaryProtocol) nor in the
+serialization code (like Pillar), but instead in a separate data structure,
+called a TypeSpec.  TypeSpecs are generated by the Thrift compiler
+(specifically in the t_cpp_generator), and their structure should be
+documented there (TODO(dreiss): s/should be/is/).
+
+We maintain a stack of TypeSpecs within the protocol so it knows where the
+generated code is in the reading/writing process.  For example, if we are
+writing an i32 contained in a struct bar, contained in a struct foo, then the
+stack would look like: TOP , i32 , struct bar , struct foo , BOTTOM.
+The following invariant: whenever we are about to read/write an object
+(structBegin, containerBegin, or a scalar), the TypeSpec on the top of the
+stack must match the type being read/written.  The main reasons that this
+invariant must be maintained is that if we ever start reading a structure, we
+must have its exact TypeSpec in order to pass the right tags to the
+deserializer.
+
+We use the following strategies for maintaining this invariant:
+
+- For structures, we have a separate stack of indexes, one for each structure
+  on the TypeSpec stack.  These are indexes into the list of fields in the
+  structure's TypeSpec.  When we {read,write}FieldBegin, we push on the
+  TypeSpec for the field.
+- When we begin writing a list or set, we push on the TypeSpec for the
+  element type.
+- For maps, we have a separate stack of booleans, one for each map on the
+  TypeSpec stack.  The boolean is true if we are writing the key for that
+  map, and false if we are writing the value.  Maps are the trickiest case
+  because the generated code does not call any protocol method between
+  the key and the value.  As a result, we potentially have to switch
+  between map key state and map value state after reading/writing any object.
+- This job is handled by the stateTransition method.  It is called after
+  reading/writing every object.  It pops the current TypeSpec off the stack,
+  then optionally pushes a new one on, depending on what the next TypeSpec is.
+  If it is a struct, the job is left to the next writeFieldBegin.  If it is a
+  set or list, the just-popped typespec is pushed back on.  If it is a map,
+  the top of the key/value stack is toggled, and the appropriate TypeSpec
+  is pushed.
+
+Optional fields are a little tricky also.  We write a zero byte if they are
+absent and prefix them with an 0x01 byte if they are present
+*/
+
 #define __STDC_LIMIT_MACROS
 #include <stdint.h>
 #include "TDenseProtocol.h"
 #include "TReflectionLocal.h"
 
-// XXX for debugging (duh)
+// Leaving this on for now.  Disabling it will turn off asserts, which should
+// give a performance boost.  When we have *really* thorough test cases,
+// we should drop this.
 #define DEBUG_TDENSEPROTOCOL
 
-// XXX for development.
-#define TDENSE_PROTOCOL_MEASURE_VLI
-
-// The XXX above does not apply to this.
+// NOTE: Assertions should *only* be used to detect bugs in code,
+//       either in TDenseProtocol itself, or in code using it.
+//       (For example, using the wrong TypeSpec.)
+//       Invalid data should NEVER cause an assertion failure,
+//       no matter how grossly corrupted, nor how ingeniously crafted.
 #ifdef DEBUG_TDENSEPROTOCOL
 #undef NDEBUG
+#else
+#define NDEBUG
 #endif
 #include <cassert>
 
@@ -31,6 +106,9 @@
 
 namespace facebook { namespace thrift { namespace protocol {
 
+const int TDenseProtocol::FP_PREFIX_LEN =
+  facebook::thrift::reflection::local::FP_PREFIX_LEN;
+
 // Top TypeSpec.  TypeSpec of the structure being encoded.
 #define TTS  (ts_stack_.back())  // type = TypeSpec*
 // InDeX.  Index into TTS of the current/next field to encode.
@@ -44,15 +122,25 @@
 #define ST2 (TTS->tcontainer.subtype2)
 
 
+/**
+ * Checks that @c ttype is indeed the ttype that we should be writing,
+ * according to our typespec.  Aborts if the test fails and debugging in on.
+ */
 inline void TDenseProtocol::checkTType(const TType ttype) {
   assert(!ts_stack_.empty());
   assert(TTS->ttype == ttype);
 }
 
+/**
+ * Makes sure that the TypeSpec stack is correct for the next object.
+ * See top-of-file comments.
+ */
 inline void TDenseProtocol::stateTransition() {
   TypeSpec* old_tts = ts_stack_.back();
   ts_stack_.pop_back();
 
+  // If this is the end of the top-level write, we should have just popped
+  // the TypeSpec passed to the constructor.
   if (ts_stack_.empty()) {
     assert(old_tts = type_spec_);
     return;
@@ -83,6 +171,84 @@
   }
 }
 
+
+/*
+ * Variable-length quantity functions.
+ */
+
+inline uint32_t TDenseProtocol::vlqRead(uint64_t& vlq) {
+  uint32_t used = 0;
+  uint64_t val = 0;
+  uint8_t buf[10];  // 64 bits / (7 bits/byte) = 10 bytes.
+  bool borrowed = trans_->borrow(buf, sizeof(buf));
+
+  // Fast path.  TODO(dreiss): Make it faster.
+  if (borrowed) {
+    while (true) {
+      uint8_t byte = buf[used];
+      used++;
+      val = (val << 7) | (byte & 0x7f);
+      if (!(byte & 0x80)) {
+        vlq = val;
+        trans_->consume(used);
+        return used;
+      }
+      // Have to check for invalid data so we don't crash.
+      if (UNLIKELY(used == sizeof(buf))) {
+        resetState();
+        throw TProtocolException(TProtocolException::INVALID_DATA, "Variable-length int over 10 bytes.");
+      }
+    }
+  }
+
+  // Slow path.
+  else {
+    while (true) {
+      uint8_t byte;
+      used += trans_->readAll(&byte, 1);
+      val = (val << 7) | (byte & 0x7f);
+      if (!(byte & 0x80)) {
+        vlq = val;
+        return used;
+      }
+      // Might as well check for invalid data on the slow path too.
+      if (UNLIKELY(used >= sizeof(buf))) {
+        resetState();
+        throw TProtocolException(TProtocolException::INVALID_DATA, "Variable-length int over 10 bytes.");
+      }
+    }
+  }
+}
+
+inline uint32_t TDenseProtocol::vlqWrite(uint64_t vlq) {
+  uint8_t buf[10];  // 64 bits / (7 bits/byte) = 10 bytes.
+  int32_t pos = sizeof(buf) - 1;
+
+  // Write the thing from back to front.
+  buf[pos] = vlq & 0x7f;
+  vlq >>= 7;
+  pos--;
+
+  while (vlq > 0) {
+    assert(pos >= 0);
+    buf[pos] = (vlq | 0x80);
+    vlq >>= 7;
+    pos--;
+  }
+
+  // Back up one step before writing.
+  pos++;
+
+  trans_->write(buf+pos, sizeof(buf) - pos);
+  return sizeof(buf) - pos;
+}
+
+
+
+/*
+ * Writing functions.
+ */
+
 uint32_t TDenseProtocol::writeMessageBegin(const std::string& name,
                                            const TMessageType messageType,
                                            const int32_t seqid) {
@@ -100,16 +266,27 @@
   return 0;
 }
 
-// Also implements readStructBegin.
 uint32_t TDenseProtocol::writeStructBegin(const string& name) {
+  uint32_t xfer = 0;
+
+  // The TypeSpec stack should be empty if this is the top-level read/write.
+  // If it is, we push the TypeSpec passed to the constructor.
   if (ts_stack_.empty()) {
+    assert(standalone_);
+
     if (type_spec_ == NULL) {
+      resetState();
       throw TApplicationException("TDenseProtocol: No type specified.");
     } else {
+      assert(type_spec_->ttype == T_STRUCT);
       ts_stack_.push_back(type_spec_);
+      // Write out a prefix of the structure fingerprint.
+      trans_->write(type_spec_->fp_prefix, FP_PREFIX_LEN);
+      xfer += FP_PREFIX_LEN;
     }
   }
 
+  // We need a new field index for this structure.
   idx_stack_.push_back(0);
   return 0;
 }
@@ -125,11 +302,14 @@
                                          const int16_t fieldId) {
   uint32_t xfer = 0;
 
+  // Skip over optional fields.
   while (FMT.tag != fieldId) {
     // TODO(dreiss): Old meta here.
     assert(FTS->ttype != T_STOP);
     assert(FMT.is_optional);
+    // Write a zero byte so the reader can skip it.
     xfer += subWriteBool(false);
+    // And advance to the next field.
     IDX++;
   }
 
@@ -141,20 +321,24 @@
     xfer += 1;
   }
 
-  // OMG I'm so gross.  XXX
-  if (FTS->ttype != T_STOP) {
+  // writeFieldStop shares all lot of logic up to this point.
+  // Instead of replicating it all, we just call this method from that one
+  // and use a gross special case here.
+  if (UNLIKELY(FTS->ttype != T_STOP)) {
+    // For normal fields, push the TypeSpec that we're about to use.
     ts_stack_.push_back(FTS);
   }
   return xfer;
 }
 
 uint32_t TDenseProtocol::writeFieldEnd() {
+  // Just move on to the next field.
   IDX++;
   return 0;
 }
 
 uint32_t TDenseProtocol::writeFieldStop() {
-  return writeFieldBegin("", T_STOP, 0);
+  return TDenseProtocol::writeFieldBegin("", T_STOP, 0);
 }
 
 uint32_t TDenseProtocol::writeMapBegin(const TType keyType,
@@ -172,6 +356,8 @@
 }
 
 uint32_t TDenseProtocol::writeMapEnd() {
+  // Pop off the value type, as well as our entry in the map key/value stack.
+  // stateTransition takes care of popping off our TypeSpec.
   ts_stack_.pop_back();
   mkv_stack_.pop_back();
   stateTransition();
@@ -188,6 +374,7 @@
 }
 
 uint32_t TDenseProtocol::writeListEnd() {
+  // Pop off the element type.  stateTransition takes care of popping off ours.
   ts_stack_.pop_back();
   stateTransition();
   return 0;
@@ -203,6 +390,7 @@
 }
 
 uint32_t TDenseProtocol::writeSetEnd() {
+  // Pop off the element type.  stateTransition takes care of popping off ours.
   ts_stack_.pop_back();
   stateTransition();
   return 0;
@@ -223,46 +411,19 @@
 uint32_t TDenseProtocol::writeI16(const int16_t i16) {
   checkTType(T_I16);
   stateTransition();
-
-  uint32_t rv = vliWrite(i16);
-#ifdef TDENSE_PROTOCOL_MEASURE_VLI
-  vli_save_16 += 2 - rv;
-  if (i16 < 0) {
-    negs++;
-  }
-#endif
-
-  return rv;
+  return vlqWrite(i16);
 }
 
 uint32_t TDenseProtocol::writeI32(const int32_t i32) {
   checkTType(T_I32);
   stateTransition();
-
-  uint32_t rv = vliWrite(i32);
-#ifdef TDENSE_PROTOCOL_MEASURE_VLI
-  vli_save_32 += 4 - rv;
-  if (i32 < 0) {
-    negs++;
-  }
-#endif
-
-  return rv;
+  return vlqWrite(i32);
 }
 
 uint32_t TDenseProtocol::writeI64(const int64_t i64) {
   checkTType(T_I64);
   stateTransition();
-
-  uint32_t rv = vliWrite(i64);
-#ifdef TDENSE_PROTOCOL_MEASURE_VLI
-  vli_save_64 += 8 - rv;
-  if (i64 < 0) {
-    negs++;
-  }
-#endif
-
-  return rv;
+  return vlqWrite(i64);
 }
 
 uint32_t TDenseProtocol::writeDouble(const double dub) {
@@ -278,14 +439,7 @@
 }
 
 inline uint32_t TDenseProtocol::subWriteI32(const int32_t i32) {
-  uint32_t rv = vliWrite(i32);
-#ifdef TDENSE_PROTOCOL_MEASURE_VLI
-  vli_save_sub += 4 - rv;
-  if (i32 < 0) {
-    negs++;
-  }
-#endif
-  return rv;
+  return vlqWrite(i32);
 }
 
 uint32_t TDenseProtocol::subWriteString(const std::string& str) {
@@ -297,73 +451,13 @@
   return xfer + size;
 }
 
-inline uint32_t TDenseProtocol::vliRead(uint64_t& vli) {
-  uint32_t used = 0;
-  uint64_t val = 0;
-  uint8_t buf[10];  // 64 bits / (7 bits/byte) = 10 bytes.
-  bool borrowed = trans_->borrow(buf, sizeof(buf));
 
-  // Fast path.  TODO(dreiss): Make it faster.
-  if (borrowed) {
-    while (true) {
-      uint8_t byte = buf[used];
-      used++;
-      val = (val << 7) | (byte & 0x7f);
-      if (!(byte & 0x80)) {
-        vli = val;
-        trans_->consume(used);
-        return used;
-      }
-      // Have to check for invalid data so we don't crash.
-      if (UNLIKELY(used == sizeof(buf))) {
-        throw TProtocolException(TProtocolException::INVALID_DATA, "Variable-length int over 10 bytes.");
-      }
-    }
-  }
 
-  // Slow path.
-  else {
-    while (true) {
-      uint8_t byte;
-      used += trans_->readAll(&byte, 1);
-      val = (val << 7) | (byte & 0x7f);
-      if (!(byte & 0x80)) {
-        vli = val;
-        return used;
-      }
-      // Might as well check for invalid data on the slow path too.
-      if (UNLIKELY(used >= sizeof(buf))) {
-        throw TProtocolException(TProtocolException::INVALID_DATA, "Variable-length int over 10 bytes.");
-      }
-    }
-  }
-}
-
-inline uint32_t TDenseProtocol::vliWrite(uint64_t vli) {
-  uint8_t buf[10];  // 64 bits / (7 bits/byte) = 10 bytes.
-  int32_t pos = sizeof(buf) - 1;
-
-  // Write the thing from back to front.
-  buf[pos] = vli & 0x7f;
-  vli >>= 7;
-  pos--;
-
-  while (vli > 0) {
-    assert(pos >= 0);
-    buf[pos] = (vli | 0x80);
-    vli >>= 7;
-    pos--;
-  }
-
-  // Back up one step before writing.
-  pos++;
-
-  trans_->write(buf+pos, sizeof(buf) - pos);
-  return sizeof(buf) - pos;
-}
-
-/**
+/*
  * Reading functions
+ *
+ * These have a lot of the same logic as the writing functions, so if
+ * something is confusing, look for comments in the corresponding writer.
  */
 
 uint32_t TDenseProtocol::readMessageBegin(std::string& name,
@@ -395,8 +489,32 @@
 }
 
 uint32_t TDenseProtocol::readStructBegin(string& name) {
-  // TODO(dreiss): Any chance this gets inlined?
-  return TDenseProtocol::writeStructBegin(name);
+  uint32_t xfer = 0;
+
+  if (ts_stack_.empty()) {
+    assert(standalone_);
+
+    if (type_spec_ == NULL) {
+      resetState();
+      throw TApplicationException("TDenseProtocol: No type specified.");
+    } else {
+      assert(type_spec_->ttype == T_STRUCT);
+      ts_stack_.push_back(type_spec_);
+
+      // Check the fingerprint prefix.
+      uint8_t buf[FP_PREFIX_LEN];
+      xfer += trans_->read(buf, FP_PREFIX_LEN);
+      if (std::memcmp(buf, type_spec_->fp_prefix, FP_PREFIX_LEN) != 0) {
+        resetState();
+        throw TProtocolException(TProtocolException::INVALID_DATA,
+            "Fingerprint in data does not match type_spec.");
+      }
+    }
+  }
+
+  // We need a new field index for this structure.
+  idx_stack_.push_back(0);
+  return 0;
 }
 
 uint32_t TDenseProtocol::readStructEnd() {
@@ -410,6 +528,7 @@
                                         int16_t& fieldId) {
   uint32_t xfer = 0;
 
+  // For optional fields, check to see if they are there.
   while (FMT.is_optional) {
     bool is_present;
     xfer += subReadBool(is_present);
@@ -419,10 +538,14 @@
     IDX++;
   }
 
+  // Once we hit a mandatory field, or an optional field that is present,
+  // we know that FMT and FTS point to the appropriate field.
+
   fieldId   = FMT.tag;
   fieldType = FTS->ttype;
 
-  // OMG I'm so gross.  XXX
+  // Normally, we push the TypeSpec that we are about to read,
+  // but no reading is done for T_STOP.
   if (FTS->ttype != T_STOP) {
     ts_stack_.push_back(FTS);
   }
@@ -443,8 +566,10 @@
   int32_t sizei;
   xfer += subReadI32(sizei);
   if (sizei < 0) {
+    resetState();
     throw TProtocolException(TProtocolException::NEGATIVE_SIZE);
   } else if (container_limit_ && sizei > container_limit_) {
+    resetState();
     throw TProtocolException(TProtocolException::SIZE_LIMIT);
   }
   size = (uint32_t)sizei;
@@ -473,8 +598,10 @@
   int32_t sizei;
   xfer += subReadI32(sizei);
   if (sizei < 0) {
+    resetState();
     throw TProtocolException(TProtocolException::NEGATIVE_SIZE);
   } else if (container_limit_ && sizei > container_limit_) {
+    resetState();
     throw TProtocolException(TProtocolException::SIZE_LIMIT);
   }
   size = (uint32_t)sizei;
@@ -500,8 +627,10 @@
   int32_t sizei;
   xfer += subReadI32(sizei);
   if (sizei < 0) {
+    resetState();
     throw TProtocolException(TProtocolException::NEGATIVE_SIZE);
   } else if (container_limit_ && sizei > container_limit_) {
+    resetState();
     throw TProtocolException(TProtocolException::SIZE_LIMIT);
   }
   size = (uint32_t)sizei;
@@ -535,9 +664,10 @@
   checkTType(T_I16);
   stateTransition();
   uint64_t u64;
-  uint32_t rv = vliRead(u64);
+  uint32_t rv = vlqRead(u64);
   int64_t val = (int64_t)u64;
   if (UNLIKELY(val > INT16_MAX || val < INT16_MIN)) {
+    resetState();
     throw TProtocolException(TProtocolException::INVALID_DATA,
                              "i16 out of range.");
   }
@@ -549,9 +679,10 @@
   checkTType(T_I32);
   stateTransition();
   uint64_t u64;
-  uint32_t rv = vliRead(u64);
+  uint32_t rv = vlqRead(u64);
   int64_t val = (int64_t)u64;
   if (UNLIKELY(val > INT32_MAX || val < INT32_MIN)) {
+    resetState();
     throw TProtocolException(TProtocolException::INVALID_DATA,
                              "i32 out of range.");
   }
@@ -563,9 +694,10 @@
   checkTType(T_I64);
   stateTransition();
   uint64_t u64;
-  uint32_t rv = vliRead(u64);
+  uint32_t rv = vlqRead(u64);
   int64_t val = (int64_t)u64;
   if (UNLIKELY(val > INT64_MAX || val < INT64_MIN)) {
+    resetState();
     throw TProtocolException(TProtocolException::INVALID_DATA,
                              "i64 out of range.");
   }
@@ -587,9 +719,10 @@
 
 uint32_t TDenseProtocol::subReadI32(int32_t& i32) {
   uint64_t u64;
-  uint32_t rv = vliRead(u64);
+  uint32_t rv = vlqRead(u64);
   int64_t val = (int64_t)u64;
   if (UNLIKELY(val > INT32_MAX || val < INT32_MIN)) {
+    resetState();
     throw TProtocolException(TProtocolException::INVALID_DATA,
                              "i32 out of range.");
   }