WebStrings.cc
Go to the documentation of this file.
00001 /* 00002 * Copyright 2006-2009 Savarese Software Research Corporation 00003 * 00004 * Licensed under the Apache License, Version 2.0 (the "License"); 00005 * you may not use this file except in compliance with the License. 00006 * You may obtain a copy of the License at 00007 * 00008 * https://www.savarese.com/software/ApacheLicense-2.0 00009 * 00010 * Unless required by applicable law or agreed to in writing, software 00011 * distributed under the License is distributed on an "AS IS" BASIS, 00012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00013 * See the License for the specific language governing permissions and 00014 * limitations under the License. 00015 */ 00016 #include <unordered_map> 00017 #include <algorithm> 00018 #include <cstring> 00019 #include <boost/algorithm/string/case_conv.hpp> 00020 #include <boost/algorithm/string/trim.hpp> 00021 #include <boost/regex.hpp> 00022 00023 #include <ssrc/wispers/utility/WebStrings.h> 00024 00025 // Note: we perform two passes on input, calculating the size of the 00026 // destination buffer on the first pass and performing the 00027 // substitutions on the second pass. We could do this in one pass, by 00028 // appending to a dynamic buffer or storing references to ranges of 00029 // the input that do not have to be altered (followed by a loop over 00030 // the ranges, writing to the output). However, the alternatives 00031 // would require O(N) dynamic memory allocations (either to resize the 00032 // buffer or to record a new range), where N is the number of 00033 // substitutions required by the input. We assume that with today's 00034 // large memory caches (and given that our input will almost always be 00035 // far less than 1 MB), it is cheaper for us to do two passes (the 00036 // first pass loads the data into the cache) and a single memory 00037 // allocation with individual character copies than one pass with N 00038 // memory allocations (possibly followed by a second loop to write the 00039 // output) and N memcpy's (even though the memcpy's will be faster than 00040 // our individual-character-copying loop). 00041 // 00042 // Also note that we could genercize the algorithms to share code 00043 // between them, using functors to specialize the behavior. We 00044 // choose not to do so for expediency of implementation. It 00045 // takes more time to figure out how to properly genericize the 00046 // code than it does to implement the utility functions independently. 00047 // 00048 // Regardless, performance optimizations and design refinements can 00049 // always be made in the future. Functionality comes first. 00050 00051 namespace { 00052 // TODO: optimize patterns. 00053 const boost::regex html_strip_pattern("<!--.*?-->|<[^>]*>"); 00054 const boost::regex html_strip_amp_pattern("<!--.*?-->|<[^>]*>|&[^#&;\\d\\s<>]+;|&#\\d+;|&#[xX][a-fA-F\\d]+;"); 00055 00056 // begin html_title_and_body support 00057 const boost::regex html_title_begin("<title[^>]*>", 00058 boost::regex_constants::normal | 00059 boost::regex_constants::icase); 00060 const boost::regex html_title_end("</title>", 00061 boost::regex_constants::normal | 00062 boost::regex_constants::icase); 00063 const boost::regex html_body_begin("<body[^>]*>", 00064 boost::regex_constants::normal | 00065 boost::regex_constants::icase); 00066 const boost::regex html_body_end("</body>", 00067 boost::regex_constants::normal | 00068 boost::regex_constants::icase); 00069 // end html_title_and_body support 00070 00071 const char hex_char[] = { 00072 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 00073 'a', 'b', 'c', 'd', 'e', 'f' 00074 }; 00075 00076 // Sorted array of characters that are either reserved according to 00077 // RFC 1738 or "unsafe". 00078 // This array is pre-sorted so that we can search it via a binary search. 00079 const char url_reserved_char[] = { 00080 ' ', '"', '#', '$', '%', '&', '\'', '+', ',', '/', ':', ';', '<', '=', '>', 00081 '?', '@', '[', '\\', ']', '^', '`', '{', '|', '}', '~' 00082 }; 00083 00084 // This array is pre-sorted so that we can search it via a binary search. 00085 // For such a small number of items, linear search or 00086 // compiler-optimized switch statement may be faster! 00087 const char html_escape_char[] = { 00088 '"', '&', '\'', '<', '>' 00089 }; 00090 00091 const char html_quot[] = { '&', 'q', 'u', 'o', 't', ';' }; 00092 const char html_amp[] = { '&', 'a', 'm', 'p', ';' }; 00093 const char html_apost[] = { '&', '#', '3', '9', ';' }; 00094 const char html_lt[] = { '&', 'l', 't', ';' }; 00095 const char html_gt[] = { '&', 'g', 't', ';' }; 00096 00097 struct HTMLEscapeInfo { 00098 const char *escape_str; 00099 const unsigned int size; 00100 }; 00101 00102 const HTMLEscapeInfo html_escape_info[] = { 00103 { 0, 1 }, 00104 { html_quot, sizeof(html_quot) }, 00105 { html_amp, sizeof(html_amp) }, 00106 { html_apost, sizeof(html_apost) }, 00107 { html_lt, sizeof(html_lt) }, 00108 { html_gt, sizeof(html_gt) } 00109 }; 00110 00111 struct HTMLEntity { 00112 const char * const entity; 00113 const unsigned char value; 00114 }; 00115 00116 const HTMLEntity html_entity[] = { 00117 { "agrave", 0xe0 }, 00118 { "aacute", 0xe1 }, 00119 { "acirc", 0xe2 }, 00120 { "atilde", 0xe3 }, 00121 { "auml", 0xe4 }, 00122 { "aring", 0xe5 }, 00123 { "aelig", 0xe6 }, 00124 { "amp", '&' }, 00125 { "ccedil", 0xe7 }, 00126 { "copy", 0xa9 }, 00127 { "egrave", 0xe8 }, 00128 { "eacute", 0xe9 }, 00129 { "ecirc", 0xea }, 00130 { "euml", 0xeb }, 00131 { "eth", 0xf0 }, 00132 { "igrave", 0xec }, 00133 { "iacute", 0xed }, 00134 { "icirc", 0xee }, 00135 { "iuml", 0xef }, 00136 { "gt", '>' }, 00137 { "lt", '<' }, 00138 { "ntilde", 0xf1 }, 00139 { "nbsp", ' ' }, 00140 { "ograve", 0xf2 }, 00141 { "oacute", 0xf3 }, 00142 { "ocirc", 0xf4 }, 00143 { "otilde", 0xf5 }, 00144 { "ouml", 0xf6 }, 00145 { "oslash", 0xf8 }, 00146 { "quot", '"' }, 00147 { "szlig", 0xdf }, 00148 { "thorn", 0xfe }, 00149 { "ugrave", 0xf9 }, 00150 { "uacute", 0xfa }, 00151 { "ucirc", 0xfb }, 00152 { "uuml", 0xfc }, 00153 { "yacute", 0xfd } 00154 }; 00155 00156 const unsigned int html_entity_size = sizeof(html_entity)/sizeof(HTMLEntity); 00157 00158 class HTMLEntityMap { 00159 typedef std::unordered_map<std::string, const unsigned char> map_type; 00160 map_type _map; 00161 00162 public: 00163 00164 HTMLEntityMap() { 00165 for(unsigned int i = 0; i < html_entity_size; ++i) { 00166 _map.insert(map_type::value_type(html_entity[i].entity, 00167 html_entity[i].value)); 00168 } 00169 } 00170 00171 unsigned char get(const std::string & key) const { 00172 map_type::const_iterator it = _map.find(key); 00173 00174 return (it == _map.end() ? 0 : it->second); 00175 } 00176 }; 00177 00178 const HTMLEntityMap html_entity_map; 00179 00180 inline bool is_reserved_url_char(const char c) { 00181 return 00182 (c < 32 || c > 122 || 00183 std::binary_search(&url_reserved_char[0], 00184 &url_reserved_char[sizeof(url_reserved_char)], c)); 00185 } 00186 00187 inline unsigned int html_escape_char_size(const char c) { 00188 switch(c) { 00189 case '"' : return sizeof(html_quot); 00190 case '&' : return sizeof(html_amp); 00191 case '\'': return sizeof(html_apost); 00192 case '<' : return sizeof(html_lt); 00193 case '>' : return sizeof(html_gt); 00194 default : return 1; 00195 }; 00196 } 00197 00198 inline unsigned int html_escape_char_index(const char c) { 00199 switch(c) { 00200 case '"' : return 1; 00201 case '&' : return 2; 00202 case '\'': return 3; 00203 case '<' : return 4; 00204 case '>' : return 5; 00205 default : return 0; 00206 }; 00207 } 00208 00209 /* 00210 inline bool is_html_escape_char(const char c) { 00211 return (html_escape_char_size(c) > 1); 00212 } 00213 00214 inline bool is_html_escape_char(const char c) { 00215 return std::binary_search(&html_escape_char[0], 00216 &html_escape_char[sizeof(html_escape_char)], c); 00217 } 00218 */ 00219 00220 inline char * escape_javascript_char(char *dest, const char c) { 00221 *dest++ = '\\'; 00222 *dest++ = 'x'; 00223 *dest++ = hex_char[((c >> 4) & 0x0f)]; 00224 *dest++ = hex_char[(c & 0x0f)]; 00225 return dest; 00226 } 00227 00228 inline char * escape_url_char(char *dest, const char c) { 00229 *dest++ = '%'; 00230 *dest++ = hex_char[((c >> 4) & 0x0f)]; 00231 *dest++ = hex_char[(c & 0x0f)]; 00232 return dest; 00233 } 00234 00235 inline unsigned int _escaped_javascript_size(const char *src, 00236 const unsigned int src_size) 00237 { 00238 unsigned int pos = 0, escaped_size = 0; 00239 00240 while(*src && pos++ < src_size) { 00241 if(*src >= 32) { 00242 switch(*src) { 00243 case '"': 00244 case '&': 00245 case '\'': 00246 case '/': 00247 case ';': 00248 case '<': 00249 case '>': 00250 case '\\': escaped_size+=4; break; 00251 default: ++escaped_size; break; 00252 }; 00253 } else { 00254 escaped_size+=4; 00255 } 00256 00257 ++src; 00258 } 00259 00260 return escaped_size; 00261 } 00262 00263 inline void _escape_javascript(char *dest, const char *src, 00264 const unsigned int src_size) 00265 { 00266 unsigned int pos = 0; 00267 00268 while(*src && pos++ < src_size) { 00269 if(*src >= 32) { 00270 switch(*src) { 00271 case '"': 00272 case '&': 00273 case '\'': 00274 case '/': 00275 case ';': 00276 case '<': 00277 case '>': 00278 case '\\': dest = escape_javascript_char(dest, *src); break; 00279 default: *dest++ = *src; break; 00280 }; 00281 } else { 00282 dest = escape_javascript_char(dest, *src); 00283 } 00284 00285 ++src; 00286 } 00287 } 00288 00289 00290 inline unsigned int _escaped_html_size(const char *src, 00291 const unsigned int src_size) 00292 { 00293 unsigned int pos = 0, escaped_size = 0; 00294 00295 while(*src && pos++ < src_size) { 00296 escaped_size+=html_escape_char_size(*src); 00297 ++src; 00298 } 00299 00300 return escaped_size; 00301 } 00302 00303 inline void _escape_html(char *dest, const char *src, 00304 const unsigned int src_size) 00305 { 00306 unsigned int pos = 0; 00307 00308 while(*src && pos++ < src_size) { 00309 const unsigned int index = html_escape_char_index(*src); 00310 00311 if(index == 0) { 00312 *dest++ = *src; 00313 } else { 00314 const HTMLEscapeInfo & info = html_escape_info[index]; 00315 std::memcpy(dest, info.escape_str, info.size); 00316 dest+=info.size; 00317 } 00318 00319 ++src; 00320 } 00321 } 00322 00323 inline unsigned int _escaped_url_size(const char *src, 00324 const unsigned int src_size) 00325 { 00326 unsigned int pos = 0, escaped_size = 0; 00327 00328 while(*src && pos++ < src_size) { 00329 escaped_size += (is_reserved_url_char(*src++) ? 3 : 1); 00330 } 00331 00332 return escaped_size; 00333 } 00334 00335 inline void _escape_url(char *dest, const char *src, 00336 const unsigned int src_size) 00337 { 00338 unsigned int pos = 0; 00339 00340 while(*src && pos++ < src_size) { 00341 if(is_reserved_url_char(*src)) { 00342 dest = escape_url_char(dest, *src); 00343 } else { 00344 *dest++ = *src; 00345 } 00346 ++src; 00347 } 00348 } 00349 00350 inline char * unescape_char_entity(char *dest, const char *src, 00351 const unsigned int src_size) 00352 { 00353 const char * const begin = src + 1; 00354 long value = 0; 00355 00356 // We can't place a temporary null guard in src because it may 00357 // be a read-only memory-mapped region. We trust strtol to end 00358 // when it encounters the terminating colon. 00359 if(*begin == '#') { 00360 if(std::tolower(*(begin + 1)) == 'x') { 00361 value = strtol(begin + 2, 0, 16); 00362 } else { 00363 value = strtol(begin + 1, 0, 10); 00364 } 00365 } else { 00366 std::string key(begin, src_size - 2); 00367 boost::to_lower(key); 00368 value = html_entity_map.get(key); 00369 } 00370 00371 if(value > 0 && value < 256) { 00372 *dest = value; 00373 return (dest + 1); 00374 } 00375 00376 return dest; 00377 } 00378 } 00379 00380 __BEGIN_NS_SSRC_WSPR_UTILITY 00381 00382 /* 00383 * Escapes characters in text that could result in executing 00384 * JavaScript code when passed to a JavaScript interpreter. 00385 * In addition, all leading and trailing whitespace is removed. 00386 * 00387 * @param result Buffer in which to store escaped text. The 00388 * string will be resized to hold the text. 00389 * @param text The text to be escaped. 00390 * @param text_size The length of the unescaped text. 00391 */ 00392 void escape_javascript(string & result, const char *text, 00393 const unsigned int text_size) 00394 { 00395 result.resize(_escaped_javascript_size(text, text_size)); 00396 _escape_javascript(&result[0], text, text_size); 00397 boost::algorithm::trim(result); 00398 } 00399 00410 void escape_html(string & result, const char *text, 00411 const unsigned int text_size) 00412 { 00413 result.resize(_escaped_html_size(text, text_size)); 00414 _escape_html(&result[0], text, text_size); 00415 boost::algorithm::trim(result); 00416 } 00417 00427 void escape_url(string & result, const char *text, 00428 const unsigned int text_size) 00429 { 00430 result.resize(_escaped_url_size(text, text_size)); 00431 _escape_url(&result[0], text, text_size); 00432 } 00433 00439 void unescape_url(string & url) { 00440 char d0, d1; 00441 string::iterator it = url.begin(), end = url.end(), next = it; 00442 00443 for(;next != end; ++it, ++next) { 00444 if(*next == '+') { 00445 *it = ' '; 00446 } else if(*next == '%' && (end - next) > 2 00447 && std::isxdigit((d0 = *(next + 1))) 00448 && std::isxdigit((d1 = *(next + 2)))) 00449 { 00450 #define FROM_HEX(c) (((c) >= 'A') ? (((c) & 0xdf) - 'A' + 10) : ((c) - '0')) 00451 char num = FROM_HEX(d0); 00452 num*=16; 00453 num+=FROM_HEX(d1); 00454 #undef FROM_HEX 00455 *it = num; 00456 next+=2; 00457 } else if(it != next) { 00458 *it = *next; 00459 } 00460 } 00461 00462 if(it != next) { 00463 url.resize(it - url.begin()); 00464 } 00465 } 00466 00475 void strip_html(string & result, const char *text, 00476 const unsigned int text_size) 00477 00478 { 00479 const boost::cregex_iterator end; 00480 boost::cregex_iterator it(text, text + text_size, html_strip_pattern); 00481 const char *src = text; 00482 00483 result.resize(text_size, ' '); 00484 00485 char *dest = &result[0]; 00486 00487 while(it != end) { 00488 const boost::cregex_iterator::reference m = *it; 00489 const boost::cregex_iterator::difference_type size = m[0].first - src; 00490 00491 if(size > 0) { 00492 std::memcpy(dest, src, size); 00493 dest+=size; 00494 } 00495 00496 src = m[0].second; 00497 ++it; 00498 } 00499 00500 const int tail_size = text_size - (src - text); 00501 if(tail_size > 0) { 00502 std::memcpy(dest, src, tail_size); 00503 } 00504 00505 boost::algorithm::trim(result); 00506 } 00507 00517 void strip_html_and_unescape(string & result, const char *text, 00518 const unsigned int text_size) 00519 { 00520 const boost::cregex_iterator end; 00521 boost::cregex_iterator it(text, text + text_size, html_strip_amp_pattern); 00522 const char *src = text; 00523 00524 result.resize(text_size, ' '); 00525 00526 char *dest = &result[0]; 00527 00528 while(it != end) { 00529 const boost::cregex_iterator::reference m = *it; 00530 const char *first = m[0].first; 00531 const boost::cregex_iterator::difference_type size = first - src; 00532 00533 if(size > 0) { 00534 std::memcpy(dest, src, size); 00535 dest+=size; 00536 } 00537 00538 if(*first == '&') { 00539 dest = unescape_char_entity(dest, first, m[0].length()); 00540 } 00541 00542 src = m[0].second; 00543 ++it; 00544 } 00545 00546 const int tail_size = text_size - (src - text); 00547 if(tail_size > 0) { 00548 std::memcpy(dest, src, tail_size); 00549 } 00550 00551 boost::algorithm::trim(result); 00552 } 00553 00563 void wrap_lines(char *text, const unsigned int text_size, 00564 const unsigned int max_length) 00565 { 00566 char * const end = text + text_size; 00567 char * pos = text; 00568 unsigned int length = 0; 00569 00570 while(pos < end) { 00571 pos = std::strchr(text, '\n'); 00572 00573 if(pos == 0) { 00574 pos = end; 00575 } 00576 00577 length = pos - text; 00578 00579 if(length > max_length) { 00580 char *s = 0; 00581 length = 0; 00582 for(char *p = text; p < pos; ++p) { 00583 if(*p == ' ' || *p == '\n') { 00584 s = p; 00585 } 00586 if(++length > max_length) { 00587 if(s != 0) { 00588 *s = '\n'; 00589 length = (p - s); 00590 s = 0; 00591 } 00592 } 00593 } 00594 } 00595 00596 text = pos + 1; 00597 } 00598 } 00599 00603 title_body_type html_title_and_body(const char *begin, const char *end) { 00604 title_body_type result(static_cast<const char *>(0), 0, 00605 static_cast<const char *>(0), 0); 00606 boost::cmatch match; 00607 00608 if(boost::regex_search(begin, end, match, html_title_begin)) { 00609 std::get<0>(result) = match[0].second; 00610 if(boost::regex_search(match[0].second, end, match, html_title_end)) { 00611 std::get<1>(result) = match[0].first - std::get<0>(result);; 00612 } 00613 } 00614 00615 if(boost::regex_search((std::get<1>(result) == 0 ? begin : 00616 std::get<0>(result) + std::get<1>(result)), 00617 end, match, html_body_begin)) 00618 { 00619 std::get<2>(result) = match[0].second; 00620 if(boost::regex_search(std::get<2>(result), end, match, html_body_end)) { 00621 std::get<3>(result) = match[0].first - std::get<2>(result); 00622 } 00623 } 00624 00625 return result; 00626 } 00627 00628 __END_NS_SSRC_WSPR_UTILITY
Copyright © 2006-2011 Savarese Software Research Corporation. All rights reserved.