From ce161a96cddbea8916d0b20406b12d96772482ed Mon Sep 17 00:00:00 2001 From: David Reiss Date: Tue, 11 Sep 2007 22:09:42 +0000 Subject: [PATCH] Thrift: Clean up and test TDenseProtocol Summary: - TDenseProtocol now includes a part of the struct fingerprint in the serialized message, to protect from unserialzing trash. - A lot of cleanups and commenting for TDenseProtocol. - A lot of test cases for same. Reviewed By: mcslee Test Plan: test/DenseProtoTest.cpp Revert Plan: ok git-svn-id: https://svn.apache.org/repos/asf/incubator/thrift/trunk@665257 13f79535-47bb-0310-9956-ffa450edef68 --- compiler/cpp/src/generate/t_cpp_generator.cc | 27 +- compiler/cpp/src/generate/t_cpp_generator.h | 3 +- lib/cpp/src/TReflectionLocal.h | 41 ++- lib/cpp/src/protocol/TDenseProtocol.cpp | 369 +++++++++++++------ lib/cpp/src/protocol/TDenseProtocol.h | 76 ++-- test/DenseProtoTest.cpp | 212 ++++++++++- test/OptionalRequiredTest.thrift | 9 + 7 files changed, 550 insertions(+), 187 deletions(-) diff --git a/compiler/cpp/src/generate/t_cpp_generator.cc b/compiler/cpp/src/generate/t_cpp_generator.cc index 4a6befdc..6de7452b 100644 --- a/compiler/cpp/src/generate/t_cpp_generator.cc +++ b/compiler/cpp/src/generate/t_cpp_generator.cc @@ -361,6 +361,7 @@ void t_cpp_generator::generate_cpp_struct(t_struct* tstruct, bool is_exception) generate_struct_fingerprint(f_types_impl_, tstruct, true); generate_local_reflection(f_types_, tstruct, false); generate_local_reflection(f_types_impl_, tstruct, true); + generate_local_reflection_pointer(f_types_impl_, tstruct); generate_struct_reader(f_types_impl_, tstruct); generate_struct_writer(f_types_impl_, tstruct); } @@ -576,7 +577,7 @@ void t_cpp_generator::generate_struct_fingerprint(ofstream& out, indent() << stat << "char* " << nspace << "ascii_fingerprint" << comment << "= \"" << tstruct->get_ascii_fingerprint() << "\";" << endl << - indent() << stat << "char " << nspace << + indent() << stat << "uint8_t " << nspace << "binary_fingerprint[" << t_type::fingerprint_len << "]" << comment << "= {"; char* comma = ""; for (int i = 0; i < t_type::fingerprint_len; i++) { @@ -681,7 +682,7 @@ void t_cpp_generator::generate_local_reflection(std::ofstream& out, if (ttype->is_struct()) { out << "," << endl << - indent() << ((t_struct*)ttype)->get_members().size() << "," << endl << + indent() << type_name(ttype) << "::binary_fingerprint," << endl << indent() << local_reflection_name("metas", ttype) << "," << endl << indent() << local_reflection_name("specs", ttype); } else if (ttype->is_list()) { @@ -701,16 +702,22 @@ void t_cpp_generator::generate_local_reflection(std::ofstream& out, out << ");" << endl << endl; indent_down(); +} - // If this is a struct and we are in the implementaion file, - // also set the class's static pointer to its reflection. - if (ttype->is_struct() && is_definition) { - indent(out) << - "facebook::thrift::reflection::local::TypeSpec* " << - ttype->get_name() << "::local_reflection = " << endl << - indent() << " &" << local_reflection_name("typespec", ttype) << ";" << - endl << endl; +/** + * Writes the structure's static pointer to its local reflection typespec + * into the implementation file. + */ +void t_cpp_generator::generate_local_reflection_pointer(std::ofstream& out, + t_type* ttype) { + if (!gen_dense_) { + return; } + indent(out) << + "facebook::thrift::reflection::local::TypeSpec* " << + ttype->get_name() << "::local_reflection = " << endl << + indent() << " &" << local_reflection_name("typespec", ttype) << ";" << + endl << endl; } /** diff --git a/compiler/cpp/src/generate/t_cpp_generator.h b/compiler/cpp/src/generate/t_cpp_generator.h index 70549904..008140a2 100644 --- a/compiler/cpp/src/generate/t_cpp_generator.h +++ b/compiler/cpp/src/generate/t_cpp_generator.h @@ -149,8 +149,9 @@ class t_cpp_generator : public t_oop_generator { std::string type_to_enum(t_type* ttype); std::string local_reflection_name(const char*, t_type* ttype); - // This handles checking gen_dense_ and checking for duplicates. + // These handles checking gen_dense_ and checking for duplicates. void generate_local_reflection(std::ofstream& out, t_type* ttype, bool is_definition); + void generate_local_reflection_pointer(std::ofstream& out, t_type* ttype); bool is_complex_type(t_type* ttype) { ttype = get_true_type(ttype); diff --git a/lib/cpp/src/TReflectionLocal.h b/lib/cpp/src/TReflectionLocal.h index df6e7067..3fe7c634 100644 --- a/lib/cpp/src/TReflectionLocal.h +++ b/lib/cpp/src/TReflectionLocal.h @@ -8,35 +8,41 @@ #define _THRIFT_TREFLECTIONLOCAL_H_ 1 #include +#include #include -namespace facebook { namespace thrift { namespace reflection { namespace local { - -using facebook::thrift::protocol::TType; - /** - * A local reflection is a representation of a Thrift structure. + * Local Reflection is a blanket term referring to the the structure + * and generation of this particular representation of Thrift types. * (It is called local because it cannot be serialized by Thrift). * * @author David Reiss */ +namespace facebook { namespace thrift { namespace reflection { namespace local { + +using facebook::thrift::protocol::TType; + +// We include this many bytes of the structure's fingerprint when serializing +// a top-level structure. Long enough to make collisions unlikely, short +// enough to not significantly affect the amount of memory used. +const int FP_PREFIX_LEN = 4; + struct FieldMeta { int16_t tag; bool is_optional; }; struct TypeSpec { + TType ttype; + uint8_t fp_prefix[FP_PREFIX_LEN]; + // Use an anonymous union here so we can fit two TypeSpecs in one cache line. union { struct { // Use parallel arrays here for denser packing (of the arrays). FieldMeta* metas; TypeSpec** specs; - // n_fields is only used for debugging, but it should only add - // a minimimal amount to the effective size of this structure - // because of alignment restrictions. - int n_fields; } tstruct; struct { TypeSpec *subtype1; @@ -44,21 +50,21 @@ struct TypeSpec { } tcontainer; }; - // Put this at the end so the 32-bit enum can be crammed up next to the - // 32-bit int (n_fields). - TType ttype; - - // Static initialization of unions isn't really possible, // so take the plunge and use constructors. // Hopefully they'll be evaluated at compile time. - TypeSpec(TType ttype) : ttype(ttype) {} + TypeSpec(TType ttype) : ttype(ttype) { + std::memset(fp_prefix, 0, FP_PREFIX_LEN); + } - TypeSpec(TType ttype, int n_fields, FieldMeta* metas, TypeSpec** specs) : + TypeSpec(TType ttype, + uint8_t* fingerprint, + FieldMeta* metas, + TypeSpec** specs) : ttype(ttype) { - tstruct.n_fields = n_fields; + std::memcpy(fp_prefix, fingerprint, FP_PREFIX_LEN); tstruct.metas = metas; tstruct.specs = specs; } @@ -66,6 +72,7 @@ struct TypeSpec { TypeSpec(TType ttype, TypeSpec* subtype1, TypeSpec* subtype2) : ttype(ttype) { + std::memset(fp_prefix, 0, FP_PREFIX_LEN); tcontainer.subtype1 = subtype1; tcontainer.subtype2 = subtype2; } diff --git a/lib/cpp/src/protocol/TDenseProtocol.cpp b/lib/cpp/src/protocol/TDenseProtocol.cpp index 99117d64..e79d4f1d 100644 --- a/lib/cpp/src/protocol/TDenseProtocol.cpp +++ b/lib/cpp/src/protocol/TDenseProtocol.cpp @@ -4,20 +4,95 @@ // See accompanying file LICENSE or visit the Thrift site at: // http://developers.facebook.com/thrift/ +/* + +IMPLEMENTATION DETAILS + +TDenseProtocol was designed to have a smaller serialized form than +TBinaryProtocol. This is accomplished using two techniques. The first is +variable-length integer encoding. We use the same technique that the Standard +MIDI File format uses for "variable-length quantities" +(http://en.wikipedia.org/wiki/Variable-length_quantity). +All integers (including i16, but not byte) are first cast to uint64_t, +then written out as variable-length quantities. This has the unfortunate side +effect that all negative numbers require 10 bytes, but negative numbers tend +to be far less common than positive ones. + +The second technique eliminating the field ids used by TBinaryProtocol. This +decision required support from the Thrift compiler and also sacrifices some of +the backward and forward compatibility of TBinaryProtocol. + +We considered implementing this technique by generating separate readers and +writers for the dense protocol (this is how Pillar, Thrift's predecessor, +worked), but this idea had a few problems: +- Our abstractions go out the window. +- We would have to maintain a second code generator. +- Preserving compatibility with old versions of the structures would be a + nightmare. + +Therefore, we chose an alternate implementation that stored the description of +the data neither in the data itself (like TBinaryProtocol) nor in the +serialization code (like Pillar), but instead in a separate data structure, +called a TypeSpec. TypeSpecs are generated by the Thrift compiler +(specifically in the t_cpp_generator), and their structure should be +documented there (TODO(dreiss): s/should be/is/). + +We maintain a stack of TypeSpecs within the protocol so it knows where the +generated code is in the reading/writing process. For example, if we are +writing an i32 contained in a struct bar, contained in a struct foo, then the +stack would look like: TOP , i32 , struct bar , struct foo , BOTTOM. +The following invariant: whenever we are about to read/write an object +(structBegin, containerBegin, or a scalar), the TypeSpec on the top of the +stack must match the type being read/written. The main reasons that this +invariant must be maintained is that if we ever start reading a structure, we +must have its exact TypeSpec in order to pass the right tags to the +deserializer. + +We use the following strategies for maintaining this invariant: + +- For structures, we have a separate stack of indexes, one for each structure + on the TypeSpec stack. These are indexes into the list of fields in the + structure's TypeSpec. When we {read,write}FieldBegin, we push on the + TypeSpec for the field. +- When we begin writing a list or set, we push on the TypeSpec for the + element type. +- For maps, we have a separate stack of booleans, one for each map on the + TypeSpec stack. The boolean is true if we are writing the key for that + map, and false if we are writing the value. Maps are the trickiest case + because the generated code does not call any protocol method between + the key and the value. As a result, we potentially have to switch + between map key state and map value state after reading/writing any object. +- This job is handled by the stateTransition method. It is called after + reading/writing every object. It pops the current TypeSpec off the stack, + then optionally pushes a new one on, depending on what the next TypeSpec is. + If it is a struct, the job is left to the next writeFieldBegin. If it is a + set or list, the just-popped typespec is pushed back on. If it is a map, + the top of the key/value stack is toggled, and the appropriate TypeSpec + is pushed. + +Optional fields are a little tricky also. We write a zero byte if they are +absent and prefix them with an 0x01 byte if they are present +*/ + #define __STDC_LIMIT_MACROS #include #include "TDenseProtocol.h" #include "TReflectionLocal.h" -// XXX for debugging (duh) +// Leaving this on for now. Disabling it will turn off asserts, which should +// give a performance boost. When we have *really* thorough test cases, +// we should drop this. #define DEBUG_TDENSEPROTOCOL -// XXX for development. -#define TDENSE_PROTOCOL_MEASURE_VLI - -// The XXX above does not apply to this. +// NOTE: Assertions should *only* be used to detect bugs in code, +// either in TDenseProtocol itself, or in code using it. +// (For example, using the wrong TypeSpec.) +// Invalid data should NEVER cause an assertion failure, +// no matter how grossly corrupted, nor how ingeniously crafted. #ifdef DEBUG_TDENSEPROTOCOL #undef NDEBUG +#else +#define NDEBUG #endif #include @@ -31,6 +106,9 @@ using std::string; namespace facebook { namespace thrift { namespace protocol { +const int TDenseProtocol::FP_PREFIX_LEN = + facebook::thrift::reflection::local::FP_PREFIX_LEN; + // Top TypeSpec. TypeSpec of the structure being encoded. #define TTS (ts_stack_.back()) // type = TypeSpec* // InDeX. Index into TTS of the current/next field to encode. @@ -44,15 +122,25 @@ namespace facebook { namespace thrift { namespace protocol { #define ST2 (TTS->tcontainer.subtype2) +/** + * Checks that @c ttype is indeed the ttype that we should be writing, + * according to our typespec. Aborts if the test fails and debugging in on. + */ inline void TDenseProtocol::checkTType(const TType ttype) { assert(!ts_stack_.empty()); assert(TTS->ttype == ttype); } +/** + * Makes sure that the TypeSpec stack is correct for the next object. + * See top-of-file comments. + */ inline void TDenseProtocol::stateTransition() { TypeSpec* old_tts = ts_stack_.back(); ts_stack_.pop_back(); + // If this is the end of the top-level write, we should have just popped + // the TypeSpec passed to the constructor. if (ts_stack_.empty()) { assert(old_tts = type_spec_); return; @@ -83,6 +171,84 @@ inline void TDenseProtocol::stateTransition() { } } + +/* + * Variable-length quantity functions. + */ + +inline uint32_t TDenseProtocol::vlqRead(uint64_t& vlq) { + uint32_t used = 0; + uint64_t val = 0; + uint8_t buf[10]; // 64 bits / (7 bits/byte) = 10 bytes. + bool borrowed = trans_->borrow(buf, sizeof(buf)); + + // Fast path. TODO(dreiss): Make it faster. + if (borrowed) { + while (true) { + uint8_t byte = buf[used]; + used++; + val = (val << 7) | (byte & 0x7f); + if (!(byte & 0x80)) { + vlq = val; + trans_->consume(used); + return used; + } + // Have to check for invalid data so we don't crash. + if (UNLIKELY(used == sizeof(buf))) { + resetState(); + throw TProtocolException(TProtocolException::INVALID_DATA, "Variable-length int over 10 bytes."); + } + } + } + + // Slow path. + else { + while (true) { + uint8_t byte; + used += trans_->readAll(&byte, 1); + val = (val << 7) | (byte & 0x7f); + if (!(byte & 0x80)) { + vlq = val; + return used; + } + // Might as well check for invalid data on the slow path too. + if (UNLIKELY(used >= sizeof(buf))) { + resetState(); + throw TProtocolException(TProtocolException::INVALID_DATA, "Variable-length int over 10 bytes."); + } + } + } +} + +inline uint32_t TDenseProtocol::vlqWrite(uint64_t vlq) { + uint8_t buf[10]; // 64 bits / (7 bits/byte) = 10 bytes. + int32_t pos = sizeof(buf) - 1; + + // Write the thing from back to front. + buf[pos] = vlq & 0x7f; + vlq >>= 7; + pos--; + + while (vlq > 0) { + assert(pos >= 0); + buf[pos] = (vlq | 0x80); + vlq >>= 7; + pos--; + } + + // Back up one step before writing. + pos++; + + trans_->write(buf+pos, sizeof(buf) - pos); + return sizeof(buf) - pos; +} + + + +/* + * Writing functions. + */ + uint32_t TDenseProtocol::writeMessageBegin(const std::string& name, const TMessageType messageType, const int32_t seqid) { @@ -100,16 +266,27 @@ uint32_t TDenseProtocol::writeMessageEnd() { return 0; } -// Also implements readStructBegin. uint32_t TDenseProtocol::writeStructBegin(const string& name) { + uint32_t xfer = 0; + + // The TypeSpec stack should be empty if this is the top-level read/write. + // If it is, we push the TypeSpec passed to the constructor. if (ts_stack_.empty()) { + assert(standalone_); + if (type_spec_ == NULL) { + resetState(); throw TApplicationException("TDenseProtocol: No type specified."); } else { + assert(type_spec_->ttype == T_STRUCT); ts_stack_.push_back(type_spec_); + // Write out a prefix of the structure fingerprint. + trans_->write(type_spec_->fp_prefix, FP_PREFIX_LEN); + xfer += FP_PREFIX_LEN; } } + // We need a new field index for this structure. idx_stack_.push_back(0); return 0; } @@ -125,11 +302,14 @@ uint32_t TDenseProtocol::writeFieldBegin(const string& name, const int16_t fieldId) { uint32_t xfer = 0; + // Skip over optional fields. while (FMT.tag != fieldId) { // TODO(dreiss): Old meta here. assert(FTS->ttype != T_STOP); assert(FMT.is_optional); + // Write a zero byte so the reader can skip it. xfer += subWriteBool(false); + // And advance to the next field. IDX++; } @@ -141,20 +321,24 @@ uint32_t TDenseProtocol::writeFieldBegin(const string& name, xfer += 1; } - // OMG I'm so gross. XXX - if (FTS->ttype != T_STOP) { + // writeFieldStop shares all lot of logic up to this point. + // Instead of replicating it all, we just call this method from that one + // and use a gross special case here. + if (UNLIKELY(FTS->ttype != T_STOP)) { + // For normal fields, push the TypeSpec that we're about to use. ts_stack_.push_back(FTS); } return xfer; } uint32_t TDenseProtocol::writeFieldEnd() { + // Just move on to the next field. IDX++; return 0; } uint32_t TDenseProtocol::writeFieldStop() { - return writeFieldBegin("", T_STOP, 0); + return TDenseProtocol::writeFieldBegin("", T_STOP, 0); } uint32_t TDenseProtocol::writeMapBegin(const TType keyType, @@ -172,6 +356,8 @@ uint32_t TDenseProtocol::writeMapBegin(const TType keyType, } uint32_t TDenseProtocol::writeMapEnd() { + // Pop off the value type, as well as our entry in the map key/value stack. + // stateTransition takes care of popping off our TypeSpec. ts_stack_.pop_back(); mkv_stack_.pop_back(); stateTransition(); @@ -188,6 +374,7 @@ uint32_t TDenseProtocol::writeListBegin(const TType elemType, } uint32_t TDenseProtocol::writeListEnd() { + // Pop off the element type. stateTransition takes care of popping off ours. ts_stack_.pop_back(); stateTransition(); return 0; @@ -203,6 +390,7 @@ uint32_t TDenseProtocol::writeSetBegin(const TType elemType, } uint32_t TDenseProtocol::writeSetEnd() { + // Pop off the element type. stateTransition takes care of popping off ours. ts_stack_.pop_back(); stateTransition(); return 0; @@ -223,46 +411,19 @@ uint32_t TDenseProtocol::writeByte(const int8_t byte) { uint32_t TDenseProtocol::writeI16(const int16_t i16) { checkTType(T_I16); stateTransition(); - - uint32_t rv = vliWrite(i16); -#ifdef TDENSE_PROTOCOL_MEASURE_VLI - vli_save_16 += 2 - rv; - if (i16 < 0) { - negs++; - } -#endif - - return rv; + return vlqWrite(i16); } uint32_t TDenseProtocol::writeI32(const int32_t i32) { checkTType(T_I32); stateTransition(); - - uint32_t rv = vliWrite(i32); -#ifdef TDENSE_PROTOCOL_MEASURE_VLI - vli_save_32 += 4 - rv; - if (i32 < 0) { - negs++; - } -#endif - - return rv; + return vlqWrite(i32); } uint32_t TDenseProtocol::writeI64(const int64_t i64) { checkTType(T_I64); stateTransition(); - - uint32_t rv = vliWrite(i64); -#ifdef TDENSE_PROTOCOL_MEASURE_VLI - vli_save_64 += 8 - rv; - if (i64 < 0) { - negs++; - } -#endif - - return rv; + return vlqWrite(i64); } uint32_t TDenseProtocol::writeDouble(const double dub) { @@ -278,14 +439,7 @@ uint32_t TDenseProtocol::writeString(const std::string& str) { } inline uint32_t TDenseProtocol::subWriteI32(const int32_t i32) { - uint32_t rv = vliWrite(i32); -#ifdef TDENSE_PROTOCOL_MEASURE_VLI - vli_save_sub += 4 - rv; - if (i32 < 0) { - negs++; - } -#endif - return rv; + return vlqWrite(i32); } uint32_t TDenseProtocol::subWriteString(const std::string& str) { @@ -297,73 +451,13 @@ uint32_t TDenseProtocol::subWriteString(const std::string& str) { return xfer + size; } -inline uint32_t TDenseProtocol::vliRead(uint64_t& vli) { - uint32_t used = 0; - uint64_t val = 0; - uint8_t buf[10]; // 64 bits / (7 bits/byte) = 10 bytes. - bool borrowed = trans_->borrow(buf, sizeof(buf)); - - // Fast path. TODO(dreiss): Make it faster. - if (borrowed) { - while (true) { - uint8_t byte = buf[used]; - used++; - val = (val << 7) | (byte & 0x7f); - if (!(byte & 0x80)) { - vli = val; - trans_->consume(used); - return used; - } - // Have to check for invalid data so we don't crash. - if (UNLIKELY(used == sizeof(buf))) { - throw TProtocolException(TProtocolException::INVALID_DATA, "Variable-length int over 10 bytes."); - } - } - } - - // Slow path. - else { - while (true) { - uint8_t byte; - used += trans_->readAll(&byte, 1); - val = (val << 7) | (byte & 0x7f); - if (!(byte & 0x80)) { - vli = val; - return used; - } - // Might as well check for invalid data on the slow path too. - if (UNLIKELY(used >= sizeof(buf))) { - throw TProtocolException(TProtocolException::INVALID_DATA, "Variable-length int over 10 bytes."); - } - } - } -} - -inline uint32_t TDenseProtocol::vliWrite(uint64_t vli) { - uint8_t buf[10]; // 64 bits / (7 bits/byte) = 10 bytes. - int32_t pos = sizeof(buf) - 1; - // Write the thing from back to front. - buf[pos] = vli & 0x7f; - vli >>= 7; - pos--; - - while (vli > 0) { - assert(pos >= 0); - buf[pos] = (vli | 0x80); - vli >>= 7; - pos--; - } - // Back up one step before writing. - pos++; - - trans_->write(buf+pos, sizeof(buf) - pos); - return sizeof(buf) - pos; -} - -/** +/* * Reading functions + * + * These have a lot of the same logic as the writing functions, so if + * something is confusing, look for comments in the corresponding writer. */ uint32_t TDenseProtocol::readMessageBegin(std::string& name, @@ -395,8 +489,32 @@ uint32_t TDenseProtocol::readMessageEnd() { } uint32_t TDenseProtocol::readStructBegin(string& name) { - // TODO(dreiss): Any chance this gets inlined? - return TDenseProtocol::writeStructBegin(name); + uint32_t xfer = 0; + + if (ts_stack_.empty()) { + assert(standalone_); + + if (type_spec_ == NULL) { + resetState(); + throw TApplicationException("TDenseProtocol: No type specified."); + } else { + assert(type_spec_->ttype == T_STRUCT); + ts_stack_.push_back(type_spec_); + + // Check the fingerprint prefix. + uint8_t buf[FP_PREFIX_LEN]; + xfer += trans_->read(buf, FP_PREFIX_LEN); + if (std::memcmp(buf, type_spec_->fp_prefix, FP_PREFIX_LEN) != 0) { + resetState(); + throw TProtocolException(TProtocolException::INVALID_DATA, + "Fingerprint in data does not match type_spec."); + } + } + } + + // We need a new field index for this structure. + idx_stack_.push_back(0); + return 0; } uint32_t TDenseProtocol::readStructEnd() { @@ -410,6 +528,7 @@ uint32_t TDenseProtocol::readFieldBegin(string& name, int16_t& fieldId) { uint32_t xfer = 0; + // For optional fields, check to see if they are there. while (FMT.is_optional) { bool is_present; xfer += subReadBool(is_present); @@ -419,10 +538,14 @@ uint32_t TDenseProtocol::readFieldBegin(string& name, IDX++; } + // Once we hit a mandatory field, or an optional field that is present, + // we know that FMT and FTS point to the appropriate field. + fieldId = FMT.tag; fieldType = FTS->ttype; - // OMG I'm so gross. XXX + // Normally, we push the TypeSpec that we are about to read, + // but no reading is done for T_STOP. if (FTS->ttype != T_STOP) { ts_stack_.push_back(FTS); } @@ -443,8 +566,10 @@ uint32_t TDenseProtocol::readMapBegin(TType& keyType, int32_t sizei; xfer += subReadI32(sizei); if (sizei < 0) { + resetState(); throw TProtocolException(TProtocolException::NEGATIVE_SIZE); } else if (container_limit_ && sizei > container_limit_) { + resetState(); throw TProtocolException(TProtocolException::SIZE_LIMIT); } size = (uint32_t)sizei; @@ -473,8 +598,10 @@ uint32_t TDenseProtocol::readListBegin(TType& elemType, int32_t sizei; xfer += subReadI32(sizei); if (sizei < 0) { + resetState(); throw TProtocolException(TProtocolException::NEGATIVE_SIZE); } else if (container_limit_ && sizei > container_limit_) { + resetState(); throw TProtocolException(TProtocolException::SIZE_LIMIT); } size = (uint32_t)sizei; @@ -500,8 +627,10 @@ uint32_t TDenseProtocol::readSetBegin(TType& elemType, int32_t sizei; xfer += subReadI32(sizei); if (sizei < 0) { + resetState(); throw TProtocolException(TProtocolException::NEGATIVE_SIZE); } else if (container_limit_ && sizei > container_limit_) { + resetState(); throw TProtocolException(TProtocolException::SIZE_LIMIT); } size = (uint32_t)sizei; @@ -535,9 +664,10 @@ uint32_t TDenseProtocol::readI16(int16_t& i16) { checkTType(T_I16); stateTransition(); uint64_t u64; - uint32_t rv = vliRead(u64); + uint32_t rv = vlqRead(u64); int64_t val = (int64_t)u64; if (UNLIKELY(val > INT16_MAX || val < INT16_MIN)) { + resetState(); throw TProtocolException(TProtocolException::INVALID_DATA, "i16 out of range."); } @@ -549,9 +679,10 @@ uint32_t TDenseProtocol::readI32(int32_t& i32) { checkTType(T_I32); stateTransition(); uint64_t u64; - uint32_t rv = vliRead(u64); + uint32_t rv = vlqRead(u64); int64_t val = (int64_t)u64; if (UNLIKELY(val > INT32_MAX || val < INT32_MIN)) { + resetState(); throw TProtocolException(TProtocolException::INVALID_DATA, "i32 out of range."); } @@ -563,9 +694,10 @@ uint32_t TDenseProtocol::readI64(int64_t& i64) { checkTType(T_I64); stateTransition(); uint64_t u64; - uint32_t rv = vliRead(u64); + uint32_t rv = vlqRead(u64); int64_t val = (int64_t)u64; if (UNLIKELY(val > INT64_MAX || val < INT64_MIN)) { + resetState(); throw TProtocolException(TProtocolException::INVALID_DATA, "i64 out of range."); } @@ -587,9 +719,10 @@ uint32_t TDenseProtocol::readString(std::string& str) { uint32_t TDenseProtocol::subReadI32(int32_t& i32) { uint64_t u64; - uint32_t rv = vliRead(u64); + uint32_t rv = vlqRead(u64); int64_t val = (int64_t)u64; if (UNLIKELY(val > INT32_MAX || val < INT32_MIN)) { + resetState(); throw TProtocolException(TProtocolException::INVALID_DATA, "i32 out of range."); } diff --git a/lib/cpp/src/protocol/TDenseProtocol.h b/lib/cpp/src/protocol/TDenseProtocol.h index edd5c886..0902e15f 100644 --- a/lib/cpp/src/protocol/TDenseProtocol.h +++ b/lib/cpp/src/protocol/TDenseProtocol.h @@ -12,13 +12,30 @@ namespace facebook { namespace thrift { namespace protocol { /** - * The dense protocol is designed to use as little space as possible. - * * !!!WARNING!!! * This class is still highly experimental. Incompatible changes * WILL be made to it without notice. DO NOT USE IT YET unless * you are coordinating your testing with the author. * + * The dense protocol is designed to use as little space as possible. + * + * There are two types of dense protocol instances. Standalone instances + * are not used for RPC and just encoded and decode structures of + * a predetermined type. Non-standalone instances are used for RPC. + * Currently, only standalone instances exist. + * + * To use a standalone dense protocol object, you must set the type_spec + * property (either in the constructor, or with setTypeSpec) to the local + * reflection TypeSpec of the structures you will write to (or read from) the + * protocol instance. + * + * BEST PRACTICES: + * - Never use optional for primitives or containers. + * - Only use optional for structures if they are very big and very rarely set. + * - All integers are variable-length, so you can use i64 without bloating. + * - NEVER EVER change the struct definitions IN ANY WAY without either + * changing your cache keys or talking to dreiss. + * * TODO(dreiss): New class write with old meta. * * We override all of TBinaryProtocol's methods. @@ -35,34 +52,18 @@ class TDenseProtocol : public TBinaryProtocol { public: typedef facebook::thrift::reflection::local::TypeSpec TypeSpec; + static const int FP_PREFIX_LEN; + /** + * @param tran The transport to use. + * @param type_spec The TypeSpec of the structures using this protocol. + */ TDenseProtocol(boost::shared_ptr trans, TypeSpec* type_spec = NULL) : TBinaryProtocol(trans), - type_spec_(type_spec) { - vli_save_16 = 0; - vli_save_32 = 0; - vli_save_64 = 0; - vli_save_sub = 0; - negs = 0; - } - - TDenseProtocol(boost::shared_ptr trans, - TypeSpec* type_spec, - int32_t string_limit, - int32_t container_limit) : - TBinaryProtocol(trans, - string_limit, - container_limit, - true, - true), - type_spec_(type_spec) { - vli_save_16 = 0; - vli_save_32 = 0; - vli_save_64 = 0; - vli_save_sub = 0; - negs = 0; - } + type_spec_(type_spec), + standalone_(true) + {} void setTypeSpec(TypeSpec* type_spec) { type_spec_ = type_spec; @@ -203,14 +204,24 @@ class TDenseProtocol : public TBinaryProtocol { private: + // Implementation functions, documented in the .cpp. inline void checkTType(const TType ttype); inline void stateTransition(); // Read and write variable-length integers. // Uses the same technique as the MIDI file format. - inline uint32_t vliRead(uint64_t& vli); - inline uint32_t vliWrite(uint64_t vli); + inline uint32_t vlqRead(uint64_t& vlq); + inline uint32_t vlqWrite(uint64_t vlq); + + // Called before throwing an exception to make the object reusable. + void resetState() { + ts_stack_.clear(); + idx_stack_.clear(); + mkv_stack_.clear(); + } + // TypeSpec of the top-level structure to write, + // for standalone protocol objects. TypeSpec* type_spec_; std::vector ts_stack_; // TypeSpec stack. @@ -218,13 +229,8 @@ class TDenseProtocol : public TBinaryProtocol { std::vector mkv_stack_; // Map Key/Vlue stack. // True = key, False = value. - // XXX Remove these when wire format is finalized. - public: - int vli_save_16; - int vli_save_32; - int vli_save_64; - int vli_save_sub; - int negs; + // True iff this is a standalone instance (no RPC). + bool standalone_; }; }}} // facebook::thrift::protocol diff --git a/test/DenseProtoTest.cpp b/test/DenseProtoTest.cpp index 0953c191..f50facaf 100644 --- a/test/DenseProtoTest.cpp +++ b/test/DenseProtoTest.cpp @@ -1,8 +1,10 @@ /* ../compiler/cpp/thrift -cpp -dense DebugProtoTest.thrift +../compiler/cpp/thrift -cpp -dense OptionalRequiredTest.thrift g++ -Wall -g -I../lib/cpp/src -I/usr/local/include/boost-1_33_1 \ - DenseProtoTest.cpp gen-cpp/DebugProtoTest_types.cpp \ - ../lib/cpp/.libs/libthrift.a -o DenseProtoTest + gen-cpp/OptionalRequiredTest_types.cpp \ + gen-cpp/DebugProtoTest_types.cpp \ + DenseProtoTest.cpp ../lib/cpp/.libs/libthrift.a -o DenseProtoTest ./DenseProtoTest */ @@ -11,10 +13,13 @@ g++ -Wall -g -I../lib/cpp/src -I/usr/local/include/boost-1_33_1 \ #define inline #undef NDEBUG +#include #include #include #include +#include #include "gen-cpp/DebugProtoTest_types.h" +#include "gen-cpp/OptionalRequiredTest_types.h" #include #include @@ -31,6 +36,7 @@ bool my_memeq(const char* str1, const char* str2, int len) { int main() { + using std::string; using std::cout; using std::endl; using boost::shared_ptr; @@ -123,13 +129,14 @@ int main() { // Let's test out the variable-length ints, shall we? - uint64_t vli; + uint64_t vlq; #define checkout(i, c) { \ buffer->resetBuffer(); \ - proto->vliWrite(i); \ + proto->vlqWrite(i); \ + proto->getTransport()->flush(); \ assert(my_memeq(buffer->getBufferAsString().data(), c, sizeof(c)-1)); \ - proto->vliRead(vli); \ - assert(vli == i); \ + proto->vlqRead(vlq); \ + assert(vlq == i); \ } checkout(0x00000000, "\x00"); @@ -160,6 +167,199 @@ int main() { checkout(0x7FFFFFFFFFFFFFFFull, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F"); checkout(0xFFFFFFFFFFFFFFFFull, "\x81\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F"); + // Test out the slow path with a TBufferedTransport. + shared_ptr buff_trans(new TBufferedTransport(buffer, 3)); + proto.reset(new TDenseProtocol(buff_trans)); + checkout(0x0000000100000000ull, "\x90\x80\x80\x80\x00"); + checkout(0x0000000200000000ull, "\xA0\x80\x80\x80\x00"); + checkout(0x0000000300000000ull, "\xB0\x80\x80\x80\x00"); + checkout(0x0000000700000000ull, "\xF0\x80\x80\x80\x00"); + checkout(0x00000007F0000000ull, "\xFF\x80\x80\x80\x00"); + checkout(0x00000007FFFFFFFFull, "\xFF\xFF\xFF\xFF\x7F"); + checkout(0x0000000800000000ull, "\x81\x80\x80\x80\x80\x00"); + checkout(0x1FFFFFFFFFFFFFFFull, "\x9F\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F"); + checkout(0x7FFFFFFFFFFFFFFFull, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F"); + checkout(0xFFFFFFFFFFFFFFFFull, "\x81\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F"); + + // Test optional stuff. + proto.reset(new TDenseProtocol(buffer)); + proto->setTypeSpec(ManyOpt::local_reflection); + ManyOpt mo1, mo2, mo3, mo4, mo5, mo6; + mo1.opt1 = 923759347; + mo1.opt2 = 392749274; + mo1.opt3 = 395739402; + mo1.def4 = 294730928; + mo1.opt5 = 394309218; + mo1.opt6 = 832194723; + mo1.__isset.opt1 = true; + mo1.__isset.opt2 = true; + mo1.__isset.opt3 = true; + mo1.__isset.def4 = true; + mo1.__isset.opt5 = true; + mo1.__isset.opt6 = true; + + mo1.write(proto.get()); + mo2.read(proto.get()); + + assert(mo2.__isset.opt1 == true); + assert(mo2.__isset.opt2 == true); + assert(mo2.__isset.opt3 == true); + assert(mo2.__isset.def4 == true); + assert(mo2.__isset.opt5 == true); + assert(mo2.__isset.opt6 == true); + + assert(mo1 == mo2); + + mo1.__isset.opt1 = false; + mo1.__isset.opt3 = false; + mo1.__isset.opt5 = false; + + mo1.write(proto.get()); + mo3.read(proto.get()); + + assert(mo3.__isset.opt1 == false); + assert(mo3.__isset.opt2 == true); + assert(mo3.__isset.opt3 == false); + assert(mo3.__isset.def4 == true); + assert(mo3.__isset.opt5 == false); + assert(mo3.__isset.opt6 == true); + + assert(mo1 == mo3); + + mo1.__isset.opt1 = true; + mo1.__isset.opt3 = true; + mo1.__isset.opt5 = true; + mo1.__isset.opt2 = false; + mo1.__isset.opt6 = false; + + mo1.write(proto.get()); + mo4.read(proto.get()); + + assert(mo4.__isset.opt1 == true); + assert(mo4.__isset.opt2 == false); + assert(mo4.__isset.opt3 == true); + assert(mo4.__isset.def4 == true); + assert(mo4.__isset.opt5 == true); + assert(mo4.__isset.opt6 == false); + + assert(mo1 == mo4); + + mo1.__isset.opt1 = false; + mo1.__isset.opt5 = false; + + mo1.write(proto.get()); + mo5.read(proto.get()); + + assert(mo5.__isset.opt1 == false); + assert(mo5.__isset.opt2 == false); + assert(mo5.__isset.opt3 == true); + assert(mo5.__isset.def4 == true); + assert(mo5.__isset.opt5 == false); + assert(mo5.__isset.opt6 == false); + + assert(mo1 == mo5); + + mo1.__isset.opt3 = false; + + mo1.write(proto.get()); + mo6.read(proto.get()); + + assert(mo6.__isset.opt1 == false); + assert(mo6.__isset.opt2 == false); + assert(mo6.__isset.opt3 == false); + assert(mo6.__isset.def4 == true); + assert(mo6.__isset.opt5 == false); + assert(mo6.__isset.opt6 == false); + + assert(mo1 == mo6); + + + // Test fingerprint checking stuff. + + { + // Default and required have the same fingerprint. + Tricky1 t1; + Tricky3 t3; + assert(string(Tricky1::ascii_fingerprint) == Tricky3::ascii_fingerprint); + proto->setTypeSpec(Tricky1::local_reflection); + t1.im_default = 227; + t1.write(proto.get()); + proto->setTypeSpec(Tricky3::local_reflection); + t3.read(proto.get()); + assert(t3.im_required == 227); + } + + { + // Optional changes things. + Tricky1 t1; + Tricky2 t2; + assert(string(Tricky1::ascii_fingerprint) != Tricky2::ascii_fingerprint); + proto->setTypeSpec(Tricky1::local_reflection); + t1.im_default = 227; + t1.write(proto.get()); + try { + proto->setTypeSpec(Tricky2::local_reflection); + t2.read(proto.get()); + assert(false); + } catch (TProtocolException& ex) { + buffer->resetBuffer(); + } + } + + { + // Holy cow. We can use the Tricky1 typespec with the Tricky2 structure. + Tricky1 t1; + Tricky2 t2; + proto->setTypeSpec(Tricky1::local_reflection); + t1.im_default = 227; + t1.write(proto.get()); + t2.read(proto.get()); + assert(t2.__isset.im_optional == true); + assert(t2.im_optional == 227); + } + + { + // And totally off the wall. + Tricky1 t1; + OneOfEach ooe2; + assert(string(Tricky1::ascii_fingerprint) != OneOfEach::ascii_fingerprint); + proto->setTypeSpec(Tricky1::local_reflection); + t1.im_default = 227; + t1.write(proto.get()); + try { + proto->setTypeSpec(OneOfEach::local_reflection); + ooe2.read(proto.get()); + assert(false); + } catch (TProtocolException& ex) { + buffer->resetBuffer(); + } + } + + // Okay, this is really off the wall. + // Just don't crash. + cout << "Starting fuzz test. This takes a while. (20 dots.)" << endl; + std::srand(12345); + for (int i = 0; i < 2000; i++) { + if (i % 100 == 0) { + cout << "."; + cout.flush(); + } + buffer->resetBuffer(); + // Make sure the fingerprint prefix is right. + buffer->write(Nesting::binary_fingerprint, 4); + for (int j = 0; j < 1024*1024; j++) { + uint8_t r = std::rand(); + buffer->write(&r, 1); + } + Nesting n; + proto->setTypeSpec(OneOfEach::local_reflection); + try { + n.read(proto.get()); + } catch (TProtocolException& ex) { + } catch (TTransportException& ex) { + } + } + cout << endl; return 0; } diff --git a/test/OptionalRequiredTest.thrift b/test/OptionalRequiredTest.thrift index b2652dd5..431a0b00 100644 --- a/test/OptionalRequiredTest.thrift +++ b/test/OptionalRequiredTest.thrift @@ -40,3 +40,12 @@ struct Complex { 5: required Simple req_simp; 6: optional Simple opt_simp; } + +struct ManyOpt { + 1: optional i32 opt1; + 2: optional i32 opt2; + 3: optional i32 opt3; + 4: i32 def4; + 5: optional i32 opt5; + 6: optional i32 opt6; +} -- 2.17.1