Thrift: Clean up and test TDenseProtocol
Summary:
- TDenseProtocol now includes a part of the struct fingerprint in
the serialized message, to protect from unserialzing trash.
- A lot of cleanups and commenting for TDenseProtocol.
- A lot of test cases for same.
Reviewed By: mcslee
Test Plan: test/DenseProtoTest.cpp
Revert Plan: ok
git-svn-id: https://svn.apache.org/repos/asf/incubator/thrift/trunk@665257 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/lib/cpp/src/protocol/TDenseProtocol.cpp b/lib/cpp/src/protocol/TDenseProtocol.cpp
index 99117d6..e79d4f1 100644
--- a/lib/cpp/src/protocol/TDenseProtocol.cpp
+++ b/lib/cpp/src/protocol/TDenseProtocol.cpp
@@ -4,20 +4,95 @@
// See accompanying file LICENSE or visit the Thrift site at:
// http://developers.facebook.com/thrift/
+/*
+
+IMPLEMENTATION DETAILS
+
+TDenseProtocol was designed to have a smaller serialized form than
+TBinaryProtocol. This is accomplished using two techniques. The first is
+variable-length integer encoding. We use the same technique that the Standard
+MIDI File format uses for "variable-length quantities"
+(http://en.wikipedia.org/wiki/Variable-length_quantity).
+All integers (including i16, but not byte) are first cast to uint64_t,
+then written out as variable-length quantities. This has the unfortunate side
+effect that all negative numbers require 10 bytes, but negative numbers tend
+to be far less common than positive ones.
+
+The second technique eliminating the field ids used by TBinaryProtocol. This
+decision required support from the Thrift compiler and also sacrifices some of
+the backward and forward compatibility of TBinaryProtocol.
+
+We considered implementing this technique by generating separate readers and
+writers for the dense protocol (this is how Pillar, Thrift's predecessor,
+worked), but this idea had a few problems:
+- Our abstractions go out the window.
+- We would have to maintain a second code generator.
+- Preserving compatibility with old versions of the structures would be a
+ nightmare.
+
+Therefore, we chose an alternate implementation that stored the description of
+the data neither in the data itself (like TBinaryProtocol) nor in the
+serialization code (like Pillar), but instead in a separate data structure,
+called a TypeSpec. TypeSpecs are generated by the Thrift compiler
+(specifically in the t_cpp_generator), and their structure should be
+documented there (TODO(dreiss): s/should be/is/).
+
+We maintain a stack of TypeSpecs within the protocol so it knows where the
+generated code is in the reading/writing process. For example, if we are
+writing an i32 contained in a struct bar, contained in a struct foo, then the
+stack would look like: TOP , i32 , struct bar , struct foo , BOTTOM.
+The following invariant: whenever we are about to read/write an object
+(structBegin, containerBegin, or a scalar), the TypeSpec on the top of the
+stack must match the type being read/written. The main reasons that this
+invariant must be maintained is that if we ever start reading a structure, we
+must have its exact TypeSpec in order to pass the right tags to the
+deserializer.
+
+We use the following strategies for maintaining this invariant:
+
+- For structures, we have a separate stack of indexes, one for each structure
+ on the TypeSpec stack. These are indexes into the list of fields in the
+ structure's TypeSpec. When we {read,write}FieldBegin, we push on the
+ TypeSpec for the field.
+- When we begin writing a list or set, we push on the TypeSpec for the
+ element type.
+- For maps, we have a separate stack of booleans, one for each map on the
+ TypeSpec stack. The boolean is true if we are writing the key for that
+ map, and false if we are writing the value. Maps are the trickiest case
+ because the generated code does not call any protocol method between
+ the key and the value. As a result, we potentially have to switch
+ between map key state and map value state after reading/writing any object.
+- This job is handled by the stateTransition method. It is called after
+ reading/writing every object. It pops the current TypeSpec off the stack,
+ then optionally pushes a new one on, depending on what the next TypeSpec is.
+ If it is a struct, the job is left to the next writeFieldBegin. If it is a
+ set or list, the just-popped typespec is pushed back on. If it is a map,
+ the top of the key/value stack is toggled, and the appropriate TypeSpec
+ is pushed.
+
+Optional fields are a little tricky also. We write a zero byte if they are
+absent and prefix them with an 0x01 byte if they are present
+*/
+
#define __STDC_LIMIT_MACROS
#include <stdint.h>
#include "TDenseProtocol.h"
#include "TReflectionLocal.h"
-// XXX for debugging (duh)
+// Leaving this on for now. Disabling it will turn off asserts, which should
+// give a performance boost. When we have *really* thorough test cases,
+// we should drop this.
#define DEBUG_TDENSEPROTOCOL
-// XXX for development.
-#define TDENSE_PROTOCOL_MEASURE_VLI
-
-// The XXX above does not apply to this.
+// NOTE: Assertions should *only* be used to detect bugs in code,
+// either in TDenseProtocol itself, or in code using it.
+// (For example, using the wrong TypeSpec.)
+// Invalid data should NEVER cause an assertion failure,
+// no matter how grossly corrupted, nor how ingeniously crafted.
#ifdef DEBUG_TDENSEPROTOCOL
#undef NDEBUG
+#else
+#define NDEBUG
#endif
#include <cassert>
@@ -31,6 +106,9 @@
namespace facebook { namespace thrift { namespace protocol {
+const int TDenseProtocol::FP_PREFIX_LEN =
+ facebook::thrift::reflection::local::FP_PREFIX_LEN;
+
// Top TypeSpec. TypeSpec of the structure being encoded.
#define TTS (ts_stack_.back()) // type = TypeSpec*
// InDeX. Index into TTS of the current/next field to encode.
@@ -44,15 +122,25 @@
#define ST2 (TTS->tcontainer.subtype2)
+/**
+ * Checks that @c ttype is indeed the ttype that we should be writing,
+ * according to our typespec. Aborts if the test fails and debugging in on.
+ */
inline void TDenseProtocol::checkTType(const TType ttype) {
assert(!ts_stack_.empty());
assert(TTS->ttype == ttype);
}
+/**
+ * Makes sure that the TypeSpec stack is correct for the next object.
+ * See top-of-file comments.
+ */
inline void TDenseProtocol::stateTransition() {
TypeSpec* old_tts = ts_stack_.back();
ts_stack_.pop_back();
+ // If this is the end of the top-level write, we should have just popped
+ // the TypeSpec passed to the constructor.
if (ts_stack_.empty()) {
assert(old_tts = type_spec_);
return;
@@ -83,6 +171,84 @@
}
}
+
+/*
+ * Variable-length quantity functions.
+ */
+
+inline uint32_t TDenseProtocol::vlqRead(uint64_t& vlq) {
+ uint32_t used = 0;
+ uint64_t val = 0;
+ uint8_t buf[10]; // 64 bits / (7 bits/byte) = 10 bytes.
+ bool borrowed = trans_->borrow(buf, sizeof(buf));
+
+ // Fast path. TODO(dreiss): Make it faster.
+ if (borrowed) {
+ while (true) {
+ uint8_t byte = buf[used];
+ used++;
+ val = (val << 7) | (byte & 0x7f);
+ if (!(byte & 0x80)) {
+ vlq = val;
+ trans_->consume(used);
+ return used;
+ }
+ // Have to check for invalid data so we don't crash.
+ if (UNLIKELY(used == sizeof(buf))) {
+ resetState();
+ throw TProtocolException(TProtocolException::INVALID_DATA, "Variable-length int over 10 bytes.");
+ }
+ }
+ }
+
+ // Slow path.
+ else {
+ while (true) {
+ uint8_t byte;
+ used += trans_->readAll(&byte, 1);
+ val = (val << 7) | (byte & 0x7f);
+ if (!(byte & 0x80)) {
+ vlq = val;
+ return used;
+ }
+ // Might as well check for invalid data on the slow path too.
+ if (UNLIKELY(used >= sizeof(buf))) {
+ resetState();
+ throw TProtocolException(TProtocolException::INVALID_DATA, "Variable-length int over 10 bytes.");
+ }
+ }
+ }
+}
+
+inline uint32_t TDenseProtocol::vlqWrite(uint64_t vlq) {
+ uint8_t buf[10]; // 64 bits / (7 bits/byte) = 10 bytes.
+ int32_t pos = sizeof(buf) - 1;
+
+ // Write the thing from back to front.
+ buf[pos] = vlq & 0x7f;
+ vlq >>= 7;
+ pos--;
+
+ while (vlq > 0) {
+ assert(pos >= 0);
+ buf[pos] = (vlq | 0x80);
+ vlq >>= 7;
+ pos--;
+ }
+
+ // Back up one step before writing.
+ pos++;
+
+ trans_->write(buf+pos, sizeof(buf) - pos);
+ return sizeof(buf) - pos;
+}
+
+
+
+/*
+ * Writing functions.
+ */
+
uint32_t TDenseProtocol::writeMessageBegin(const std::string& name,
const TMessageType messageType,
const int32_t seqid) {
@@ -100,16 +266,27 @@
return 0;
}
-// Also implements readStructBegin.
uint32_t TDenseProtocol::writeStructBegin(const string& name) {
+ uint32_t xfer = 0;
+
+ // The TypeSpec stack should be empty if this is the top-level read/write.
+ // If it is, we push the TypeSpec passed to the constructor.
if (ts_stack_.empty()) {
+ assert(standalone_);
+
if (type_spec_ == NULL) {
+ resetState();
throw TApplicationException("TDenseProtocol: No type specified.");
} else {
+ assert(type_spec_->ttype == T_STRUCT);
ts_stack_.push_back(type_spec_);
+ // Write out a prefix of the structure fingerprint.
+ trans_->write(type_spec_->fp_prefix, FP_PREFIX_LEN);
+ xfer += FP_PREFIX_LEN;
}
}
+ // We need a new field index for this structure.
idx_stack_.push_back(0);
return 0;
}
@@ -125,11 +302,14 @@
const int16_t fieldId) {
uint32_t xfer = 0;
+ // Skip over optional fields.
while (FMT.tag != fieldId) {
// TODO(dreiss): Old meta here.
assert(FTS->ttype != T_STOP);
assert(FMT.is_optional);
+ // Write a zero byte so the reader can skip it.
xfer += subWriteBool(false);
+ // And advance to the next field.
IDX++;
}
@@ -141,20 +321,24 @@
xfer += 1;
}
- // OMG I'm so gross. XXX
- if (FTS->ttype != T_STOP) {
+ // writeFieldStop shares all lot of logic up to this point.
+ // Instead of replicating it all, we just call this method from that one
+ // and use a gross special case here.
+ if (UNLIKELY(FTS->ttype != T_STOP)) {
+ // For normal fields, push the TypeSpec that we're about to use.
ts_stack_.push_back(FTS);
}
return xfer;
}
uint32_t TDenseProtocol::writeFieldEnd() {
+ // Just move on to the next field.
IDX++;
return 0;
}
uint32_t TDenseProtocol::writeFieldStop() {
- return writeFieldBegin("", T_STOP, 0);
+ return TDenseProtocol::writeFieldBegin("", T_STOP, 0);
}
uint32_t TDenseProtocol::writeMapBegin(const TType keyType,
@@ -172,6 +356,8 @@
}
uint32_t TDenseProtocol::writeMapEnd() {
+ // Pop off the value type, as well as our entry in the map key/value stack.
+ // stateTransition takes care of popping off our TypeSpec.
ts_stack_.pop_back();
mkv_stack_.pop_back();
stateTransition();
@@ -188,6 +374,7 @@
}
uint32_t TDenseProtocol::writeListEnd() {
+ // Pop off the element type. stateTransition takes care of popping off ours.
ts_stack_.pop_back();
stateTransition();
return 0;
@@ -203,6 +390,7 @@
}
uint32_t TDenseProtocol::writeSetEnd() {
+ // Pop off the element type. stateTransition takes care of popping off ours.
ts_stack_.pop_back();
stateTransition();
return 0;
@@ -223,46 +411,19 @@
uint32_t TDenseProtocol::writeI16(const int16_t i16) {
checkTType(T_I16);
stateTransition();
-
- uint32_t rv = vliWrite(i16);
-#ifdef TDENSE_PROTOCOL_MEASURE_VLI
- vli_save_16 += 2 - rv;
- if (i16 < 0) {
- negs++;
- }
-#endif
-
- return rv;
+ return vlqWrite(i16);
}
uint32_t TDenseProtocol::writeI32(const int32_t i32) {
checkTType(T_I32);
stateTransition();
-
- uint32_t rv = vliWrite(i32);
-#ifdef TDENSE_PROTOCOL_MEASURE_VLI
- vli_save_32 += 4 - rv;
- if (i32 < 0) {
- negs++;
- }
-#endif
-
- return rv;
+ return vlqWrite(i32);
}
uint32_t TDenseProtocol::writeI64(const int64_t i64) {
checkTType(T_I64);
stateTransition();
-
- uint32_t rv = vliWrite(i64);
-#ifdef TDENSE_PROTOCOL_MEASURE_VLI
- vli_save_64 += 8 - rv;
- if (i64 < 0) {
- negs++;
- }
-#endif
-
- return rv;
+ return vlqWrite(i64);
}
uint32_t TDenseProtocol::writeDouble(const double dub) {
@@ -278,14 +439,7 @@
}
inline uint32_t TDenseProtocol::subWriteI32(const int32_t i32) {
- uint32_t rv = vliWrite(i32);
-#ifdef TDENSE_PROTOCOL_MEASURE_VLI
- vli_save_sub += 4 - rv;
- if (i32 < 0) {
- negs++;
- }
-#endif
- return rv;
+ return vlqWrite(i32);
}
uint32_t TDenseProtocol::subWriteString(const std::string& str) {
@@ -297,73 +451,13 @@
return xfer + size;
}
-inline uint32_t TDenseProtocol::vliRead(uint64_t& vli) {
- uint32_t used = 0;
- uint64_t val = 0;
- uint8_t buf[10]; // 64 bits / (7 bits/byte) = 10 bytes.
- bool borrowed = trans_->borrow(buf, sizeof(buf));
- // Fast path. TODO(dreiss): Make it faster.
- if (borrowed) {
- while (true) {
- uint8_t byte = buf[used];
- used++;
- val = (val << 7) | (byte & 0x7f);
- if (!(byte & 0x80)) {
- vli = val;
- trans_->consume(used);
- return used;
- }
- // Have to check for invalid data so we don't crash.
- if (UNLIKELY(used == sizeof(buf))) {
- throw TProtocolException(TProtocolException::INVALID_DATA, "Variable-length int over 10 bytes.");
- }
- }
- }
- // Slow path.
- else {
- while (true) {
- uint8_t byte;
- used += trans_->readAll(&byte, 1);
- val = (val << 7) | (byte & 0x7f);
- if (!(byte & 0x80)) {
- vli = val;
- return used;
- }
- // Might as well check for invalid data on the slow path too.
- if (UNLIKELY(used >= sizeof(buf))) {
- throw TProtocolException(TProtocolException::INVALID_DATA, "Variable-length int over 10 bytes.");
- }
- }
- }
-}
-
-inline uint32_t TDenseProtocol::vliWrite(uint64_t vli) {
- uint8_t buf[10]; // 64 bits / (7 bits/byte) = 10 bytes.
- int32_t pos = sizeof(buf) - 1;
-
- // Write the thing from back to front.
- buf[pos] = vli & 0x7f;
- vli >>= 7;
- pos--;
-
- while (vli > 0) {
- assert(pos >= 0);
- buf[pos] = (vli | 0x80);
- vli >>= 7;
- pos--;
- }
-
- // Back up one step before writing.
- pos++;
-
- trans_->write(buf+pos, sizeof(buf) - pos);
- return sizeof(buf) - pos;
-}
-
-/**
+/*
* Reading functions
+ *
+ * These have a lot of the same logic as the writing functions, so if
+ * something is confusing, look for comments in the corresponding writer.
*/
uint32_t TDenseProtocol::readMessageBegin(std::string& name,
@@ -395,8 +489,32 @@
}
uint32_t TDenseProtocol::readStructBegin(string& name) {
- // TODO(dreiss): Any chance this gets inlined?
- return TDenseProtocol::writeStructBegin(name);
+ uint32_t xfer = 0;
+
+ if (ts_stack_.empty()) {
+ assert(standalone_);
+
+ if (type_spec_ == NULL) {
+ resetState();
+ throw TApplicationException("TDenseProtocol: No type specified.");
+ } else {
+ assert(type_spec_->ttype == T_STRUCT);
+ ts_stack_.push_back(type_spec_);
+
+ // Check the fingerprint prefix.
+ uint8_t buf[FP_PREFIX_LEN];
+ xfer += trans_->read(buf, FP_PREFIX_LEN);
+ if (std::memcmp(buf, type_spec_->fp_prefix, FP_PREFIX_LEN) != 0) {
+ resetState();
+ throw TProtocolException(TProtocolException::INVALID_DATA,
+ "Fingerprint in data does not match type_spec.");
+ }
+ }
+ }
+
+ // We need a new field index for this structure.
+ idx_stack_.push_back(0);
+ return 0;
}
uint32_t TDenseProtocol::readStructEnd() {
@@ -410,6 +528,7 @@
int16_t& fieldId) {
uint32_t xfer = 0;
+ // For optional fields, check to see if they are there.
while (FMT.is_optional) {
bool is_present;
xfer += subReadBool(is_present);
@@ -419,10 +538,14 @@
IDX++;
}
+ // Once we hit a mandatory field, or an optional field that is present,
+ // we know that FMT and FTS point to the appropriate field.
+
fieldId = FMT.tag;
fieldType = FTS->ttype;
- // OMG I'm so gross. XXX
+ // Normally, we push the TypeSpec that we are about to read,
+ // but no reading is done for T_STOP.
if (FTS->ttype != T_STOP) {
ts_stack_.push_back(FTS);
}
@@ -443,8 +566,10 @@
int32_t sizei;
xfer += subReadI32(sizei);
if (sizei < 0) {
+ resetState();
throw TProtocolException(TProtocolException::NEGATIVE_SIZE);
} else if (container_limit_ && sizei > container_limit_) {
+ resetState();
throw TProtocolException(TProtocolException::SIZE_LIMIT);
}
size = (uint32_t)sizei;
@@ -473,8 +598,10 @@
int32_t sizei;
xfer += subReadI32(sizei);
if (sizei < 0) {
+ resetState();
throw TProtocolException(TProtocolException::NEGATIVE_SIZE);
} else if (container_limit_ && sizei > container_limit_) {
+ resetState();
throw TProtocolException(TProtocolException::SIZE_LIMIT);
}
size = (uint32_t)sizei;
@@ -500,8 +627,10 @@
int32_t sizei;
xfer += subReadI32(sizei);
if (sizei < 0) {
+ resetState();
throw TProtocolException(TProtocolException::NEGATIVE_SIZE);
} else if (container_limit_ && sizei > container_limit_) {
+ resetState();
throw TProtocolException(TProtocolException::SIZE_LIMIT);
}
size = (uint32_t)sizei;
@@ -535,9 +664,10 @@
checkTType(T_I16);
stateTransition();
uint64_t u64;
- uint32_t rv = vliRead(u64);
+ uint32_t rv = vlqRead(u64);
int64_t val = (int64_t)u64;
if (UNLIKELY(val > INT16_MAX || val < INT16_MIN)) {
+ resetState();
throw TProtocolException(TProtocolException::INVALID_DATA,
"i16 out of range.");
}
@@ -549,9 +679,10 @@
checkTType(T_I32);
stateTransition();
uint64_t u64;
- uint32_t rv = vliRead(u64);
+ uint32_t rv = vlqRead(u64);
int64_t val = (int64_t)u64;
if (UNLIKELY(val > INT32_MAX || val < INT32_MIN)) {
+ resetState();
throw TProtocolException(TProtocolException::INVALID_DATA,
"i32 out of range.");
}
@@ -563,9 +694,10 @@
checkTType(T_I64);
stateTransition();
uint64_t u64;
- uint32_t rv = vliRead(u64);
+ uint32_t rv = vlqRead(u64);
int64_t val = (int64_t)u64;
if (UNLIKELY(val > INT64_MAX || val < INT64_MIN)) {
+ resetState();
throw TProtocolException(TProtocolException::INVALID_DATA,
"i64 out of range.");
}
@@ -587,9 +719,10 @@
uint32_t TDenseProtocol::subReadI32(int32_t& i32) {
uint64_t u64;
- uint32_t rv = vliRead(u64);
+ uint32_t rv = vlqRead(u64);
int64_t val = (int64_t)u64;
if (UNLIKELY(val > INT32_MAX || val < INT32_MIN)) {
+ resetState();
throw TProtocolException(TProtocolException::INVALID_DATA,
"i32 out of range.");
}