From e3ab0bf507d1f3b6e898df702869331ee8ac8e12 Mon Sep 17 00:00:00 2001 From: Jens Geyer Date: Tue, 11 Mar 2014 22:31:53 +0200 Subject: [PATCH] THRIFT-2375 Excessive
's in generated HTML Patch: Jens Geyer --- compiler/cpp/src/generate/t_html_generator.cc | 168 +++++++++++++++--- 1 file changed, 146 insertions(+), 22 deletions(-) diff --git a/compiler/cpp/src/generate/t_html_generator.cc b/compiler/cpp/src/generate/t_html_generator.cc index 701fdd10..8ac7ddb2 100644 --- a/compiler/cpp/src/generate/t_html_generator.cc +++ b/compiler/cpp/src/generate/t_html_generator.cc @@ -70,6 +70,8 @@ class t_html_generator : public t_generator { escape_['>'] = ">"; escape_['"'] = """; escape_['\''] = "'"; + + init_allowed__markup(); } void generate_program(); @@ -79,12 +81,14 @@ class t_html_generator : public t_generator { std::vector& finished); void generate_index(); std::string escape_html(std::string const & str); + std::string escape_html_tags(std::string const & str); void generate_css(); void generate_css_content(std::ofstream & f_target); void generate_style_tag(); std::string make_file_link( std::string name); bool is_utf8_sequence(std::string const & str, size_t firstpos); void detect_input_encoding(std::string const & str, size_t firstpos); + void init_allowed__markup(); /** * Program-level generation functions @@ -106,7 +110,7 @@ class t_html_generator : public t_generator { std::ofstream f_out_; std::string current_file_; input_type input_type_; - + std::map allowed_markup; bool standalone_; }; @@ -395,21 +399,7 @@ std::string t_html_generator::make_file_link( std::string filename) { */ void t_html_generator::print_doc(t_doc* tdoc) { if (tdoc->has_doc()) { - string doc = tdoc->get_doc(); - size_t index; - while ((index = doc.find_first_of("\r\n")) != string::npos) { - if (index == 0) { - f_out_ << "

" << endl; - } else { - f_out_ << escape_html( doc.substr(0, index)) << endl; - } - if (index + 1 < doc.size() && doc.at(index) != doc.at(index + 1) && - (doc.at(index + 1) == '\r' || doc.at(index + 1) == '\n')) { - index++; - } - doc = doc.substr(index + 1); - } - f_out_ << escape_html(doc) << "
"; + f_out_ << escape_html(tdoc->get_doc()) << "
"; } } @@ -462,12 +452,141 @@ void t_html_generator::detect_input_encoding(std::string const & str, size_t fir input_type_ = INPUT_PLAIN; } -std::string t_html_generator::escape_html(std::string const & str) { +void t_html_generator::init_allowed__markup() { + allowed_markup.clear(); + // standalone tags + allowed_markup["br"] = 1; + allowed_markup["br/"] = 1; + allowed_markup["img"] = 1; + // paired tags + allowed_markup["b"] = 1; + allowed_markup["/b"] = 1; + allowed_markup["u"] = 1; + allowed_markup["/u"] = 1; + allowed_markup["i"] = 1; + allowed_markup["/i"] = 1; + allowed_markup["s"] = 1; + allowed_markup["/s"] = 1; + allowed_markup["big"] = 1; + allowed_markup["/big"] = 1; + allowed_markup["small"] = 1; + allowed_markup["/small"] = 1; + allowed_markup["sup"] = 1; + allowed_markup["/sup"] = 1; + allowed_markup["sub"] = 1; + allowed_markup["/sub"] = 1; + allowed_markup["pre"] = 1; + allowed_markup["/pre"] = 1; + allowed_markup["tt"] = 1; + allowed_markup["/tt"] = 1; + allowed_markup["ul"] = 1; + allowed_markup["/ul"] = 1; + allowed_markup["ol"] = 1; + allowed_markup["/ol"] = 1; + allowed_markup["li"] = 1; + allowed_markup["/li"] = 1; + allowed_markup["a"] = 1; + allowed_markup["/a"] = 1; + allowed_markup["p"] = 1; + allowed_markup["/p"] = 1; + allowed_markup["code"] = 1; + allowed_markup["/code"] = 1; + allowed_markup["dl"] = 1; + allowed_markup["/dl"] = 1; + allowed_markup["dt"] = 1; + allowed_markup["/dt"] = 1; + allowed_markup["dd"] = 1; + allowed_markup["/dd"] = 1; + allowed_markup["h1"] = 1; + allowed_markup["/h1"] = 1; + allowed_markup["h2"] = 1; + allowed_markup["/h2"] = 1; + allowed_markup["h3"] = 1; + allowed_markup["/h3"] = 1; + allowed_markup["h4"] = 1; + allowed_markup["/h4"] = 1; + allowed_markup["h5"] = 1; + allowed_markup["/h5"] = 1; + allowed_markup["h6"] = 1; + allowed_markup["/h6"] = 1; +} + +std::string t_html_generator::escape_html_tags(std::string const & str) { + std::ostringstream result; + + unsigned char c = '?'; + size_t lastpos; + size_t firstpos = 0; + while( firstpos < str.length()) { + + // look for non-ASCII char + lastpos = firstpos; + while( lastpos < str.length()) { + c = str.at(lastpos); + if( ('<' == c) || ('>' == c)) { + break; + } + ++lastpos; + } + + // copy what we got so far + if( lastpos > firstpos) { + result << str.substr( firstpos, lastpos-firstpos); + firstpos = lastpos; + } + + // reached the end? + if( firstpos >= str.length()) { + break; + } + + // tag end without corresponding begin + ++firstpos; + if( '>' == c) { + result << ">"; + continue; + } + + // extract the tag + std::ostringstream tagstream; + while( firstpos < str.length()) { + c = str.at(firstpos); + ++firstpos; + if('<'==c) { + tagstream << "<"; // nested begin? + } else if('>'==c) { + break; + } else { + tagstream << c; // not very efficient, but tags should be quite short + } + } + + // we allow for several markup in docstrings, all else will become escaped + string tag_content = tagstream.str(); + string tag_key = tag_content; + size_t first_white = tag_key.find_first_of(" \t\f\v\n\r"); + if( first_white != string::npos) { + tag_key.erase(first_white); + } + for (std::string::size_type i=0; i"; + } else { + result << "<" << tagstream.str() << ">"; + pverbose("illegal markup <%s> in doc-comment\n", tag_key.c_str()); + } + } + + return result.str(); +} +std::string t_html_generator::escape_html(std::string const & str) { // the generated HTML header says it is UTF-8 encoded // if UTF-8 input has been detected before, we don't need to change anything if( input_type_ == INPUT_UTF8) { - return str; + return escape_html_tags(str); } // convert unsafe chars to their &#; equivalent @@ -495,6 +614,11 @@ std::string t_html_generator::escape_html(std::string const & str) { firstpos = lastpos; } + // reached the end? + if( firstpos >= str.length()) { + break; + } + // some control code? if( (0 <= ic) && (31 >= ic)) { @@ -502,10 +626,10 @@ std::string t_html_generator::escape_html(std::string const & str) { { case '\r' : case '\n' : - result << "
"; - break; case '\t' : - result << " "; + result << c; + break; + default: // silently consume all other ctrl chars break; } ++firstpos; @@ -538,7 +662,7 @@ std::string t_html_generator::escape_html(std::string const & str) { } } - return result.str(); + return escape_html_tags(result.str()); } /** -- 2.17.1