From: Jens Geyer Date: Mon, 24 Dec 2012 09:32:58 +0000 (+0100) Subject: THRIFT-1800 Documentation text not always escaped correctly when rendered to HTML X-Git-Tag: 0.9.1~218 X-Git-Url: https://source.supwisdom.com/gerrit/gitweb?a=commitdiff_plain;h=63e3c6307806f58a0325a1fe895e7c7f6b73d6f3;p=common%2Fthrift.git THRIFT-1800 Documentation text not always escaped correctly when rendered to HTML Patch: Jens Geyer --- diff --git a/compiler/cpp/src/generate/t_html_generator.cc b/compiler/cpp/src/generate/t_html_generator.cc index 59993eab..6838706d 100644 --- a/compiler/cpp/src/generate/t_html_generator.cc +++ b/compiler/cpp/src/generate/t_html_generator.cc @@ -32,6 +32,8 @@ using namespace std; +enum input_type { INPUT_UNKNOWN, INPUT_UTF8, INPUT_PLAIN }; + /** * HTML code generator * @@ -48,6 +50,7 @@ class t_html_generator : public t_generator { (void) parsed_options; (void) option_string; out_dir_base_ = "gen-html"; + input_type_ = INPUT_UNKNOWN; std::map::const_iterator iter; iter = parsed_options.find("standalone"); @@ -67,10 +70,13 @@ class t_html_generator : public t_generator { void generate_program_toc_rows(t_program* tprog, std::vector& finished); void generate_index(); + std::string escape_html(std::string const & str); void generate_css(); void generate_css_content(std::ofstream & f_target); void generate_style_tag(); std::string make_file_link( std::string name); + bool is_utf8_sequence(std::string const & str, size_t firstpos); + void detect_input_encoding(std::string const & str, size_t firstpos); /** * Program-level generation functions @@ -91,10 +97,12 @@ class t_html_generator : public t_generator { private: std::ofstream f_out_; std::string current_file_; + input_type input_type_; bool standalone_; }; + /** * Emits the Table of Contents links at the top of the module's page */ @@ -301,7 +309,7 @@ void t_html_generator::generate_program() { f_out_ << "" << endl; f_out_.close(); - + generate_index(); generate_css(); } @@ -383,18 +391,146 @@ void t_html_generator::print_doc(t_doc* tdoc) { size_t index; while ((index = doc.find_first_of("\r\n")) != string::npos) { if (index == 0) { - f_out_ << "

" << endl; + f_out_ << "

" << endl; } else { - f_out_ << doc.substr(0, index) << endl; + f_out_ << escape_html( doc.substr(0, index)) << endl; } if (index + 1 < doc.size() && doc.at(index) != doc.at(index + 1) && - (doc.at(index + 1) == '\r' || doc.at(index + 1) == '\n')) { - index++; + (doc.at(index + 1) == '\r' || doc.at(index + 1) == '\n')) { + index++; } doc = doc.substr(index + 1); } - f_out_ << doc << "
"; + f_out_ << escape_html(doc) << "
"; + } +} + +bool t_html_generator::is_utf8_sequence(std::string const & str, size_t firstpos) { + // leading char determines the length of the sequence + unsigned char c = str.at(firstpos); + int count = 0; + if( (c & 0xE0) == 0xC0) { + count = 1; + } else if( (c & 0xF0) == 0xE0) { + count = 2; + } else if( (c & 0xF8) == 0xF0) { + count = 3; + } else if( (c & 0xFC) == 0xF8) { + count = 4; + } else if( (c & 0xFE) == 0xFC) { + count = 5; + } else { + //pdebug("UTF-8 test: char '%c' (%d) is not a valid UTF-8 leading byte", c, int(c)); + return false; // no UTF-8 + } + + // following chars + size_t pos = firstpos + 1; + while( (pos < str.length()) && (0 < count)) + { + c = str.at(pos); + if( (c & 0xC0) != 0x80) { + //pdebug("UTF-8 test: char '%c' (%d) is not a valid UTF-8 following byte", c, int(c)); + return false; // no UTF-8 + } + --count; + ++pos; + } + + // true if the sequence is complete + return (0 == count); +} + +void t_html_generator::detect_input_encoding(std::string const & str, size_t firstpos) { + if( is_utf8_sequence(str,firstpos)) + { + pdebug( "Input seems to be already UTF-8 encoded"); + input_type_ = INPUT_UTF8; + return; } + + // fallback + pwarning( 1, "Input is not UTF-8, treating as plain ANSI"); + input_type_ = INPUT_PLAIN; +} + +std::string t_html_generator::escape_html(std::string const & str) { + + // the generated HTML header says it is UTF-8 encoded + // if UTF-8 input has been detected before, we don't need to change anything + if( input_type_ == INPUT_UTF8) { + return str; + } + + // convert unsafe chars to their &#; equivalent + std::ostringstream result; + unsigned char c = '?'; + unsigned int ic = 0; + size_t lastpos; + size_t firstpos = 0; + while( firstpos < str.length()) { + + // look for non-ASCII char + lastpos = firstpos; + while( lastpos < str.length()) { + c = str.at(lastpos); + ic = c; + if( (32 > ic) || (127 < ic)) { + break; + } + ++lastpos; + } + + // copy what we got so far + if( lastpos > firstpos) { + result << str.substr( firstpos, lastpos-firstpos); + firstpos = lastpos; + } + + // some control code? + if( (0 <= ic) && (31 >= ic)) + { + switch( c) + { + case '\r' : + case '\n' : + result << "
"; + break; + case '\t' : + result << " "; + break; + } + ++firstpos; + continue; + } + + // reached the end? + if( firstpos >= str.length()) { + break; + } + + // try to detect input encoding + if( input_type_ == INPUT_UNKNOWN) { + detect_input_encoding(str,firstpos); + if( input_type_ == INPUT_UTF8) { + lastpos = str.length(); + result << str.substr( firstpos, lastpos-firstpos); + break; + } + } + + // convert the character to something useful based on the detected encoding + switch( input_type_) { + case INPUT_PLAIN: + result << "&#" << ic << ";"; + ++firstpos; + break; + default: + throw "Unexpected or unrecognized input encoding"; + } + } + + return result.str(); } /** @@ -460,7 +596,7 @@ void t_html_generator::print_const_value(t_const_value* tvalue) { f_out_ << tvalue->get_double(); break; case t_const_value::CV_STRING: - f_out_ << '"' << get_escaped_string(tvalue) << '"'; + f_out_ << '"' << escape_html( get_escaped_string(tvalue)) << '"'; break; case t_const_value::CV_MAP: { @@ -521,7 +657,7 @@ void t_html_generator::print_fn_args_doc(t_function* tfunction) { for ( ; arg_iter != args.end(); arg_iter++) { f_out_ << "" << (*arg_iter)->get_name(); f_out_ << ""; - f_out_ << (*arg_iter)->get_doc(); + f_out_ << escape_html( (*arg_iter)->get_doc()); f_out_ << "" << endl; } f_out_ << ""; @@ -545,7 +681,7 @@ void t_html_generator::print_fn_args_doc(t_function* tfunction) { for ( ; ex_iter != excepts.end(); ex_iter++) { f_out_ << "" << (*ex_iter)->get_type()->get_name(); f_out_ << ""; - f_out_ << (*ex_iter)->get_doc(); + f_out_ << escape_html( (*ex_iter)->get_doc()); f_out_ << "" << endl; } f_out_ << ""; @@ -640,7 +776,7 @@ void t_html_generator::generate_struct(t_struct* tstruct) { f_out_ << ""; print_type((*mem_iter)->get_type()); f_out_ << ""; - f_out_ << (*mem_iter)->get_doc(); + f_out_ << escape_html( (*mem_iter)->get_doc()); f_out_ << ""; if ((*mem_iter)->get_req() == t_field::T_OPTIONAL) { f_out_ << "optional";