From: Jens Geyer <jensg@apache.org>
Date: Mon, 24 Dec 2012 09:32:58 +0000 (+0100)
Subject: THRIFT-1800 Documentation text not always escaped correctly when rendered to HTML
X-Git-Tag: 0.9.1~218
X-Git-Url: https://source.supwisdom.com/gerrit/gitweb?a=commitdiff_plain;h=63e3c6307806f58a0325a1fe895e7c7f6b73d6f3;p=common%2Fthrift.git

THRIFT-1800 Documentation text not always escaped correctly when rendered to HTML
Patch: Jens Geyer
---

diff --git a/compiler/cpp/src/generate/t_html_generator.cc b/compiler/cpp/src/generate/t_html_generator.cc
index 59993eab..6838706d 100644
--- a/compiler/cpp/src/generate/t_html_generator.cc
+++ b/compiler/cpp/src/generate/t_html_generator.cc
@@ -32,6 +32,8 @@
 using namespace std;
 
 
+enum input_type { INPUT_UNKNOWN, INPUT_UTF8, INPUT_PLAIN };
+
 /**
  * HTML code generator
  *
@@ -48,6 +50,7 @@ class t_html_generator : public t_generator {
     (void) parsed_options;
     (void) option_string;  
     out_dir_base_ = "gen-html";
+    input_type_ = INPUT_UNKNOWN;
     
     std::map<std::string, std::string>::const_iterator iter;
     iter = parsed_options.find("standalone");
@@ -67,10 +70,13 @@ class t_html_generator : public t_generator {
   void generate_program_toc_rows(t_program* tprog,
          std::vector<t_program*>& finished);
   void generate_index();
+  std::string escape_html(std::string const & str);
   void generate_css();
   void generate_css_content(std::ofstream & f_target);
   void generate_style_tag();
   std::string make_file_link( std::string name);
+  bool is_utf8_sequence(std::string const & str, size_t firstpos);
+  void detect_input_encoding(std::string const & str, size_t firstpos);
   
   /**
    * Program-level generation functions
@@ -91,10 +97,12 @@ class t_html_generator : public t_generator {
  private:
   std::ofstream f_out_;
   std::string  current_file_;
+  input_type input_type_;
  
   bool standalone_; 
 };
 
+
 /**
  * Emits the Table of Contents links at the top of the module's page
  */
@@ -301,7 +309,7 @@ void t_html_generator::generate_program() {
 
   f_out_ << "</div></body></html>" << endl;
   f_out_.close();
-
+  
   generate_index();
   generate_css();
 }
@@ -383,18 +391,146 @@ void t_html_generator::print_doc(t_doc* tdoc) {
     size_t index;
     while ((index = doc.find_first_of("\r\n")) != string::npos) {
       if (index == 0) {
-  f_out_ << "<p/>" << endl;
+        f_out_ << "<p/>" << endl;
       } else {
-  f_out_ << doc.substr(0, index) << endl;
+        f_out_ << escape_html( doc.substr(0, index)) << endl;
       }
       if (index + 1 < doc.size() && doc.at(index) != doc.at(index + 1) &&
-    (doc.at(index + 1) == '\r' || doc.at(index + 1) == '\n')) {
-  index++;
+         (doc.at(index + 1) == '\r' || doc.at(index + 1) == '\n')) {
+        index++;
       }
       doc = doc.substr(index + 1);
     }
-    f_out_ << doc << "<br/>";
+    f_out_ << escape_html(doc) << "<br/>";
+  }
+}
+
+bool t_html_generator::is_utf8_sequence(std::string const & str, size_t firstpos) {
+  // leading char determines the length of the sequence
+  unsigned char c = str.at(firstpos);
+  int count = 0;
+  if(        (c & 0xE0) == 0xC0) {
+    count = 1;
+  } else if( (c & 0xF0) == 0xE0) {
+    count = 2;
+  } else if( (c & 0xF8) == 0xF0) {
+    count = 3;
+  } else if( (c & 0xFC) == 0xF8) {
+    count = 4;
+  } else if( (c & 0xFE) == 0xFC) {
+    count = 5;
+  } else {
+    //pdebug("UTF-8 test: char '%c' (%d) is not a valid UTF-8 leading byte", c, int(c));
+    return false;  // no UTF-8
+  }
+
+  // following chars
+  size_t pos = firstpos + 1;
+  while( (pos < str.length()) && (0 < count))
+  {
+    c = str.at(pos);
+    if( (c & 0xC0) !=  0x80) {
+      //pdebug("UTF-8 test: char '%c' (%d) is not a valid UTF-8 following byte", c, int(c));
+      return false;  // no UTF-8
+    }    
+    --count;
+    ++pos;
+  }
+  
+  // true if the sequence is complete
+  return (0 == count);
+}
+
+void t_html_generator::detect_input_encoding(std::string const & str, size_t firstpos) {
+  if( is_utf8_sequence(str,firstpos))
+  {
+    pdebug( "Input seems to be already UTF-8 encoded");
+    input_type_ = INPUT_UTF8;
+    return;
   }
+
+  // fallback 
+  pwarning( 1, "Input is not UTF-8, treating as plain ANSI");
+  input_type_ = INPUT_PLAIN;
+}
+
+std::string t_html_generator::escape_html(std::string const & str) {
+
+  // the generated HTML header says it is UTF-8 encoded
+  // if UTF-8 input has been detected before, we don't need to change anything
+  if( input_type_ == INPUT_UTF8) {
+    return str;
+  }
+  
+  // convert unsafe chars to their &#<num>; equivalent
+  std::ostringstream result;
+  unsigned char c = '?';
+  unsigned int ic = 0;
+  size_t lastpos;
+  size_t firstpos = 0;
+  while( firstpos < str.length()) {
+
+    // look for non-ASCII char  
+    lastpos = firstpos;    
+    while( lastpos < str.length()) {
+      c = str.at(lastpos);
+      ic = c;
+      if( (32 > ic) || (127 < ic)) {
+        break;
+      }
+      ++lastpos;
+    }
+    
+    // copy what we got so far    
+    if( lastpos > firstpos) {
+      result << str.substr( firstpos, lastpos-firstpos);
+      firstpos = lastpos;
+    }
+
+    // some control code?
+    if( (0 <= ic) && (31 >= ic))
+    {
+      switch( c)
+      {
+        case '\r' :  
+        case '\n' :  
+          result << "<br/>";  
+          break;
+        case '\t' :
+          result << " ";  
+          break;
+      }
+      ++firstpos;
+      continue;        
+    }
+    
+    // reached the end?
+    if( firstpos >= str.length()) {
+      break;
+    }
+
+    // try to detect input encoding
+    if( input_type_ == INPUT_UNKNOWN) {
+      detect_input_encoding(str,firstpos);
+      if( input_type_ == INPUT_UTF8) {
+        lastpos = str.length();
+        result << str.substr( firstpos, lastpos-firstpos);
+        break;
+      }
+    }
+    
+    // convert the character to something useful based on the detected encoding
+    switch( input_type_) {
+      case INPUT_PLAIN: 
+        result << "&#" << ic << ";";  
+        ++firstpos;
+        break;
+      default:
+        throw "Unexpected or unrecognized input encoding";
+    }
+  }
+  
+  return result.str();
 }
 
 /**
@@ -460,7 +596,7 @@ void t_html_generator::print_const_value(t_const_value* tvalue) {
     f_out_ << tvalue->get_double();
     break;
   case t_const_value::CV_STRING:
-    f_out_ << '"' << get_escaped_string(tvalue) << '"';
+    f_out_ << '"' << escape_html( get_escaped_string(tvalue)) << '"';
     break;
   case t_const_value::CV_MAP:
     {
@@ -521,7 +657,7 @@ void t_html_generator::print_fn_args_doc(t_function* tfunction) {
       for ( ; arg_iter != args.end(); arg_iter++) {
         f_out_ << "<tr><td>" << (*arg_iter)->get_name();
         f_out_ << "</td><td>";
-        f_out_ << (*arg_iter)->get_doc();
+        f_out_ << escape_html( (*arg_iter)->get_doc());
         f_out_ << "</td></tr>" << endl;
       }
       f_out_ << "</table>";
@@ -545,7 +681,7 @@ void t_html_generator::print_fn_args_doc(t_function* tfunction) {
       for ( ; ex_iter != excepts.end(); ex_iter++) {
         f_out_ << "<tr><td>" << (*ex_iter)->get_type()->get_name();
         f_out_ << "</td><td>";
-        f_out_ << (*ex_iter)->get_doc();
+        f_out_ << escape_html( (*ex_iter)->get_doc());
         f_out_ << "</td></tr>" << endl;
       }
       f_out_ << "</table>";
@@ -640,7 +776,7 @@ void t_html_generator::generate_struct(t_struct* tstruct) {
     f_out_ << "</td><td>";
     print_type((*mem_iter)->get_type());
     f_out_ << "</td><td>";
-    f_out_ << (*mem_iter)->get_doc();
+    f_out_ << escape_html( (*mem_iter)->get_doc());
     f_out_ << "</td><td>";
     if ((*mem_iter)->get_req() == t_field::T_OPTIONAL) {
       f_out_ << "optional";