Web Wispers C++ API 1.2.2: WebStrings.cc Source File

WebStrings.cc
00001 /*
00002  * Copyright 2006-2009 Savarese Software Research Corporation
00003  *
00004  * Licensed under the Apache License, Version 2.0 (the "License");
00005  * you may not use this file except in compliance with the License.
00006  * You may obtain a copy of the License at
00007  *
00008  *     https://www.savarese.com/software/ApacheLicense-2.0
00009  *
00010  * Unless required by applicable law or agreed to in writing, software
00011  * distributed under the License is distributed on an "AS IS" BASIS,
00012  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00013  * See the License for the specific language governing permissions and
00014  * limitations under the License.
00015  */
00016 #include <unordered_map>
00017 #include <algorithm>
00018 #include <cstring>
00019 #include <boost/algorithm/string/case_conv.hpp>
00020 #include <boost/algorithm/string/trim.hpp>
00021 #include <boost/regex.hpp>
00022 
00023 #include <ssrc/wispers/utility/WebStrings.h>
00024 
00025 // Note: we perform two passes on input, calculating the size of the
00026 // destination buffer on the first pass and performing the
00027 // substitutions on the second pass.  We could do this in one pass, by
00028 // appending to a dynamic buffer or storing references to ranges of
00029 // the input that do not have to be altered (followed by a loop over
00030 // the ranges, writing to the output).  However, the alternatives
00031 // would require O(N) dynamic memory allocations (either to resize the
00032 // buffer or to record a new range), where N is the number of
00033 // substitutions required by the input.  We assume that with today's
00034 // large memory caches (and given that our input will almost always be
00035 // far less than 1 MB), it is cheaper for us to do two passes (the
00036 // first pass loads the data into the cache) and a single memory
00037 // allocation with individual character copies than one pass with N
00038 // memory allocations (possibly followed by a second loop to write the
00039 // output) and N memcpy's (even though the memcpy's will be faster than
00040 // our individual-character-copying loop).
00041 //
00042 // Also note that we could genercize the algorithms to share code
00043 // between them, using functors to specialize the behavior.  We
00044 // choose not to do so for expediency of implementation.  It
00045 // takes more time to figure out how to properly genericize the
00046 // code than it does to implement the utility functions independently.
00047 //
00048 // Regardless, performance optimizations and design refinements can
00049 // always be made in the future.  Functionality comes first.
00050 
00051 namespace {
00052   // TODO: optimize patterns.
00053   const boost::regex html_strip_pattern("<!--.*?-->|<[^>]*>");
00054   const boost::regex html_strip_amp_pattern("<!--.*?-->|<[^>]*>|&[^#&;\\d\\s<>]+;|&#\\d+;|&#[xX][a-fA-F\\d]+;");
00055 
00056   // begin html_title_and_body support
00057   const boost::regex html_title_begin("<title[^>]*>",
00058                                       boost::regex_constants::normal |
00059                                       boost::regex_constants::icase);
00060   const boost::regex html_title_end("</title>",
00061                                       boost::regex_constants::normal |
00062                                       boost::regex_constants::icase);
00063   const boost::regex html_body_begin("<body[^>]*>",
00064                                      boost::regex_constants::normal |
00065                                      boost::regex_constants::icase);
00066   const boost::regex html_body_end("</body>",
00067                                    boost::regex_constants::normal |
00068                                    boost::regex_constants::icase);
00069   // end html_title_and_body support
00070 
00071   const char hex_char[] = {
00072     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
00073     'a', 'b', 'c', 'd', 'e', 'f'
00074   };
00075 
00076   // Sorted array of characters that are either reserved according to
00077   // RFC 1738 or "unsafe". 
00078   // This array is pre-sorted so that we can search it via a binary search.
00079   const char url_reserved_char[] = {
00080     ' ', '"', '#', '$', '%', '&', '\'', '+', ',', '/', ':', ';', '<', '=', '>',
00081     '?', '@', '[', '\\', ']', '^', '`', '{', '|', '}', '~'
00082   };
00083 
00084   // This array is pre-sorted so that we can search it via a binary search.
00085   // For such a small number of items, linear search or
00086   // compiler-optimized switch statement may be faster!
00087   const char html_escape_char[] = {
00088     '"', '&', '\'', '<', '>'
00089   };
00090 
00091   const char html_quot[]  = { '&', 'q', 'u', 'o', 't', ';' };
00092   const char html_amp[]   = { '&', 'a', 'm', 'p', ';' };
00093   const char html_apost[] = { '&', '#', '3', '9', ';' };
00094   const char html_lt[]    = { '&', 'l', 't', ';' };
00095   const char html_gt[]    = { '&', 'g', 't', ';' };
00096 
00097   struct HTMLEscapeInfo {
00098     const char *escape_str;
00099     const unsigned int size;
00100   };
00101 
00102   const HTMLEscapeInfo html_escape_info[] = {
00103     { 0, 1 },
00104     { html_quot, sizeof(html_quot) },
00105     { html_amp, sizeof(html_amp) },
00106     { html_apost, sizeof(html_apost) },
00107     { html_lt, sizeof(html_lt) },
00108     { html_gt, sizeof(html_gt) }
00109   };
00110 
00111   struct HTMLEntity {
00112     const char * const entity;
00113     const unsigned char value;
00114   };
00115 
00116   const HTMLEntity html_entity[] = {
00117     { "agrave", 0xe0 },
00118     { "aacute", 0xe1 },
00119     { "acirc", 0xe2 },
00120     { "atilde", 0xe3 },
00121     { "auml", 0xe4 },
00122     { "aring", 0xe5 },
00123     { "aelig", 0xe6 },
00124     { "amp", '&' },
00125     { "ccedil", 0xe7 },
00126     { "copy", 0xa9 },
00127     { "egrave", 0xe8 },
00128     { "eacute", 0xe9 },
00129     { "ecirc", 0xea },
00130     { "euml", 0xeb },
00131     { "eth", 0xf0 },
00132     { "igrave", 0xec },
00133     { "iacute", 0xed },
00134     { "icirc", 0xee },
00135     { "iuml", 0xef },
00136     { "gt", '>' },
00137     { "lt", '<' },
00138     { "ntilde", 0xf1 },
00139     { "nbsp", ' ' },
00140     { "ograve", 0xf2 },
00141     { "oacute", 0xf3 },
00142     { "ocirc", 0xf4 },
00143     { "otilde", 0xf5 },
00144     { "ouml", 0xf6 },
00145     { "oslash", 0xf8 },
00146     { "quot", '"' },
00147     { "szlig", 0xdf },
00148     { "thorn", 0xfe },
00149     { "ugrave", 0xf9 },
00150     { "uacute", 0xfa },
00151     { "ucirc", 0xfb },
00152     { "uuml", 0xfc },
00153     { "yacute", 0xfd }
00154   };
00155 
00156   const unsigned int html_entity_size = sizeof(html_entity)/sizeof(HTMLEntity);
00157 
00158   class HTMLEntityMap {
00159     typedef std::unordered_map<std::string, const unsigned char> map_type;
00160     map_type _map;
00161 
00162   public:
00163 
00164     HTMLEntityMap() {
00165       for(unsigned int i = 0; i < html_entity_size; ++i) {
00166         _map.insert(map_type::value_type(html_entity[i].entity,
00167                                          html_entity[i].value));
00168       }
00169     }
00170 
00171     unsigned char get(const std::string & key) const {
00172       map_type::const_iterator it = _map.find(key);
00173 
00174       return (it == _map.end() ? 0 : it->second);
00175     }
00176   };
00177 
00178   const HTMLEntityMap html_entity_map;
00179 
00180   inline bool is_reserved_url_char(const char c) {
00181     return
00182       (c < 32 || c > 122 ||
00183        std::binary_search(&url_reserved_char[0],
00184                           &url_reserved_char[sizeof(url_reserved_char)], c));
00185   }
00186 
00187   inline unsigned int html_escape_char_size(const char c) {
00188     switch(c) {
00189     case '"' : return sizeof(html_quot);
00190     case '&' : return sizeof(html_amp);
00191     case '\'': return sizeof(html_apost);
00192     case '<' : return sizeof(html_lt);
00193     case '>' : return sizeof(html_gt);
00194     default  : return 1;
00195     };
00196   }
00197 
00198   inline unsigned int html_escape_char_index(const char c) {
00199     switch(c) {
00200     case '"' : return 1;
00201     case '&' : return 2;
00202     case '\'': return 3;
00203     case '<' : return 4;
00204     case '>' : return 5;
00205     default  : return 0;
00206     };
00207   }
00208 
00209   /*
00210   inline bool is_html_escape_char(const char c) {
00211     return (html_escape_char_size(c) > 1);
00212   }
00213 
00214   inline bool is_html_escape_char(const char c) {
00215     return std::binary_search(&html_escape_char[0],
00216                               &html_escape_char[sizeof(html_escape_char)], c);
00217   }
00218   */
00219 
00220   inline char * escape_javascript_char(char *dest, const char c) {
00221     *dest++ = '\\';
00222     *dest++ = 'x';
00223     *dest++ = hex_char[((c >> 4) & 0x0f)];
00224     *dest++ = hex_char[(c & 0x0f)];
00225     return dest;
00226   }
00227 
00228   inline char * escape_url_char(char *dest, const char c) {
00229     *dest++ = '%';
00230     *dest++ = hex_char[((c >> 4) & 0x0f)];
00231     *dest++ = hex_char[(c & 0x0f)];
00232     return dest;
00233   }
00234 
00235   inline unsigned int _escaped_javascript_size(const char *src,
00236                                                const unsigned int src_size)
00237   {
00238     unsigned int pos = 0, escaped_size = 0;
00239 
00240     while(*src && pos++ < src_size) {
00241       if(*src >= 32) {
00242         switch(*src) {
00243         case '"':
00244         case '&':
00245         case '\'':
00246         case '/':
00247         case ';':
00248         case '<':
00249         case '>':
00250         case '\\': escaped_size+=4; break;
00251         default: ++escaped_size; break;
00252         };
00253       } else {
00254         escaped_size+=4;
00255       }
00256 
00257       ++src;
00258     }
00259 
00260     return escaped_size;
00261   }
00262 
00263   inline void _escape_javascript(char *dest, const char *src,
00264                                  const unsigned int src_size)
00265   {
00266     unsigned int pos = 0;
00267 
00268     while(*src && pos++ < src_size) {
00269       if(*src >= 32) {
00270         switch(*src) {
00271         case '"':
00272         case '&':
00273         case '\'':
00274         case '/':
00275         case ';':
00276         case '<':
00277         case '>':
00278         case '\\': dest = escape_javascript_char(dest, *src); break;
00279         default: *dest++ = *src; break;
00280         };
00281       } else {
00282         dest = escape_javascript_char(dest, *src);
00283       }
00284 
00285       ++src;
00286     }
00287   }
00288 
00289 
00290   inline unsigned int _escaped_html_size(const char *src,
00291                                          const unsigned int src_size)
00292   {
00293     unsigned int pos = 0, escaped_size = 0;
00294 
00295     while(*src && pos++ < src_size) {
00296       escaped_size+=html_escape_char_size(*src);
00297       ++src;
00298     }
00299 
00300     return escaped_size;
00301   }
00302 
00303   inline void _escape_html(char *dest, const char *src,
00304                            const unsigned int src_size)
00305   {
00306     unsigned int pos = 0;
00307 
00308     while(*src && pos++ < src_size) {
00309       const unsigned int index = html_escape_char_index(*src);
00310 
00311       if(index == 0) {
00312         *dest++ = *src;
00313       } else {
00314         const HTMLEscapeInfo & info = html_escape_info[index];
00315         std::memcpy(dest, info.escape_str, info.size);
00316         dest+=info.size;
00317       }
00318 
00319       ++src;
00320     }
00321   }
00322 
00323   inline unsigned int _escaped_url_size(const char *src,
00324                                         const unsigned int src_size)
00325   {
00326     unsigned int pos = 0, escaped_size = 0;
00327 
00328     while(*src && pos++ < src_size) {
00329       escaped_size += (is_reserved_url_char(*src++) ? 3 : 1);
00330     }
00331 
00332     return escaped_size;
00333   }
00334 
00335   inline void _escape_url(char *dest, const char *src,
00336                           const unsigned int src_size)
00337   {
00338     unsigned int pos = 0;
00339 
00340     while(*src && pos++ < src_size) {
00341       if(is_reserved_url_char(*src)) {
00342         dest = escape_url_char(dest, *src);
00343       } else {
00344         *dest++ = *src;
00345       }
00346       ++src;
00347     }
00348   }
00349 
00350   inline char * unescape_char_entity(char *dest, const char *src,
00351                                      const unsigned int src_size)
00352   {
00353     const char * const begin = src + 1;
00354     long value = 0;
00355 
00356     // We can't place a temporary null guard in src because it may
00357     // be a read-only memory-mapped region.  We trust strtol to end
00358     // when it encounters the terminating colon.
00359     if(*begin == '#') {
00360       if(std::tolower(*(begin + 1)) == 'x') {
00361         value = strtol(begin + 2, 0, 16);
00362       } else {
00363         value = strtol(begin + 1, 0, 10);
00364       }
00365     } else {
00366       std::string key(begin, src_size - 2);
00367       boost::to_lower(key);
00368       value = html_entity_map.get(key);
00369     }
00370 
00371     if(value > 0 && value < 256) {
00372       *dest = value;
00373       return (dest + 1);
00374     }
00375 
00376     return dest;
00377   }
00378 }
00379 
00380 __BEGIN_NS_SSRC_WSPR_UTILITY
00381 
00382 /*
00383  * Escapes characters in text that could result in executing
00384  * JavaScript code when passed to a JavaScript interpreter.
00385  * In addition, all leading and trailing whitespace is removed.
00386  *
00387  * @param result Buffer in which to store escaped text.  The
00388  *        string will be resized to hold the text.
00389  * @param text The text to be escaped.
00390  * @param text_size The length of the unescaped text.
00391  */
00392 void escape_javascript(string & result, const char *text,
00393                        const unsigned int text_size)
00394 {
00395   result.resize(_escaped_javascript_size(text, text_size));
00396   _escape_javascript(&result[0], text, text_size);
00397   boost::algorithm::trim(result);
00398 }
00399 
00410 void escape_html(string & result, const char *text,
00411                  const unsigned int text_size)
00412 {
00413   result.resize(_escaped_html_size(text, text_size));
00414   _escape_html(&result[0], text, text_size);
00415   boost::algorithm::trim(result);
00416 }
00417 
00427 void escape_url(string & result, const char *text,
00428                 const unsigned int text_size)
00429 {
00430   result.resize(_escaped_url_size(text, text_size));
00431   _escape_url(&result[0], text, text_size);
00432 }
00433 
00439 void unescape_url(string & url) {
00440   char d0, d1;
00441   string::iterator it = url.begin(), end = url.end(), next = it;
00442 
00443   for(;next != end; ++it, ++next) {
00444     if(*next == '+') {
00445       *it = ' ';
00446     } else if(*next == '%' && (end - next) > 2
00447               && std::isxdigit((d0 = *(next + 1)))
00448               && std::isxdigit((d1 = *(next + 2))))
00449     {
00450 #define FROM_HEX(c) (((c) >= 'A') ? (((c) & 0xdf) - 'A' + 10) : ((c) - '0'))
00451       char num = FROM_HEX(d0);
00452       num*=16;
00453       num+=FROM_HEX(d1);
00454 #undef FROM_HEX
00455       *it = num;
00456       next+=2;
00457     } else if(it != next) {
00458       *it = *next;
00459     }
00460   }
00461 
00462   if(it != next) {
00463     url.resize(it - url.begin());
00464   }
00465 }
00466 
00475 void strip_html(string & result, const char *text,
00476                 const unsigned int text_size)
00477 
00478 {
00479   const boost::cregex_iterator end;
00480   boost::cregex_iterator it(text, text + text_size, html_strip_pattern);
00481   const char *src = text;
00482 
00483   result.resize(text_size, ' ');
00484 
00485   char *dest = &result[0];
00486 
00487   while(it != end) {
00488     const boost::cregex_iterator::reference m = *it;
00489     const boost::cregex_iterator::difference_type size = m[0].first - src;
00490 
00491     if(size > 0) {
00492       std::memcpy(dest, src, size);
00493       dest+=size;
00494     }
00495 
00496     src = m[0].second;
00497     ++it;
00498   }
00499 
00500   const int tail_size = text_size - (src - text);
00501   if(tail_size > 0) {
00502     std::memcpy(dest, src, tail_size);
00503   }
00504 
00505   boost::algorithm::trim(result);
00506 }
00507 
00517 void strip_html_and_unescape(string & result, const char *text,
00518                              const unsigned int text_size)
00519 {
00520   const boost::cregex_iterator end;
00521   boost::cregex_iterator it(text, text + text_size, html_strip_amp_pattern);
00522   const char *src = text;
00523 
00524   result.resize(text_size, ' ');
00525 
00526   char *dest = &result[0];
00527 
00528   while(it != end) {
00529     const boost::cregex_iterator::reference m = *it;
00530     const char *first = m[0].first;
00531     const boost::cregex_iterator::difference_type size = first - src;
00532 
00533     if(size > 0) {
00534       std::memcpy(dest, src, size);
00535       dest+=size;
00536     }
00537 
00538     if(*first == '&') {
00539       dest = unescape_char_entity(dest, first, m[0].length());
00540     }
00541 
00542     src = m[0].second;
00543     ++it;
00544   }
00545 
00546   const int tail_size = text_size - (src - text);
00547   if(tail_size > 0) {
00548     std::memcpy(dest, src, tail_size);
00549   }
00550 
00551   boost::algorithm::trim(result);
00552 }
00553 
00563 void wrap_lines(char *text, const unsigned int text_size,
00564                 const unsigned int max_length)
00565 {
00566   char * const end = text + text_size;
00567   char * pos = text;
00568   unsigned int length = 0;
00569 
00570   while(pos < end) {
00571     pos = std::strchr(text, '\n');
00572 
00573     if(pos == 0) {
00574       pos = end;
00575     }
00576 
00577     length = pos - text;
00578 
00579     if(length > max_length) {
00580       char *s = 0;
00581       length = 0;
00582       for(char *p = text; p < pos; ++p) {
00583         if(*p == ' ' || *p == '\n') {
00584           s = p;
00585         }
00586         if(++length > max_length) {
00587           if(s != 0) {
00588             *s = '\n';
00589             length = (p - s);
00590             s = 0;
00591           }
00592         }
00593       }
00594     }
00595 
00596     text = pos + 1;
00597   }
00598 }
00599 
00603 title_body_type html_title_and_body(const char *begin, const char *end) {
00604   title_body_type result(static_cast<const char *>(0), 0,
00605                          static_cast<const char *>(0), 0);
00606   boost::cmatch match;
00607 
00608   if(boost::regex_search(begin, end, match, html_title_begin)) {
00609     std::get<0>(result) = match[0].second;
00610     if(boost::regex_search(match[0].second, end, match, html_title_end)) {
00611       std::get<1>(result) = match[0].first - std::get<0>(result);;
00612     }
00613   }
00614 
00615   if(boost::regex_search((std::get<1>(result) == 0 ? begin :
00616                           std::get<0>(result) + std::get<1>(result)),
00617                          end, match, html_body_begin))
00618   {
00619     std::get<2>(result) = match[0].second;
00620     if(boost::regex_search(std::get<2>(result), end, match, html_body_end)) {
00621       std::get<3>(result) = match[0].first - std::get<2>(result);
00622     }
00623   }
00624 
00625   return result;
00626 }
00627 
00628 __END_NS_SSRC_WSPR_UTILITY