// -*- mode: c++; c-basic-offset:4 -*-
// This file is part of libdap, A C++ implementation of the OPeNDAP Data
// Access Protocol.
// Copyright (c) 2002,2003 OPeNDAP, Inc.
// Author: James Gallagher
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
//
// You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
// Copyright (c) 1996, California Institute of Technology.
// ALL RIGHTS RESERVED. U.S. Government Sponsorship acknowledged.
//
// Please read the full copyright notice in the file COPYRIGHT_URI
// in this directory.
//
// Author: Todd Karakashian, NASA/Jet Propulsion Laboratory
// Todd.K.Karakashian@jpl.nasa.gov
//
// $RCSfile: escaping.cc,v $ - Miscellaneous routines for OPeNDAP HDF server
//
// These two routines are for escaping/unescaping strings that are identifiers
// in DAP2
// id2www() -- escape (using WWW hex codes) non-allowable characters in a
// DAP2 identifier
// www2id() -- given a WWW hexcode escaped identifier, restore it
//
// These two routines are for escaping/unescaping strings storing attribute
// values.
They use traditional octal escapes (\nnn) because they are // intended to be viewed by a user // escattr() -- escape (using traditional octal backslash) non-allowable // characters in the value of a DAP2 attribute // unescattr() -- given an octally escaped string, restore it // // These are routines used by the above, not intended to be called directly: // // hexstring() // unhexstring() // octstring() // unoctstring() // // -Todd #include "config.h" #include #include #include #include #include "GNURegex.h" #include "Error.h" #include "InternalErr.h" //#define DODS_DEBUG #include "debug.h" using namespace std; namespace libdap { // The next four functions were originally defined static, but I removed that // to make testing them (see generalUtilTest.cc) easier to write. 5/7/2001 // jhrg string hexstring(unsigned char val) { ostringstream buf; buf << hex << setw(2) << setfill('0') << static_cast(val); return buf.str(); } string unhexstring(string s) { int val; istringstream ss(s); ss >> hex >> val; char tmp_str[2]; tmp_str[0] = static_cast(val); tmp_str[1] = '\0'; return string(tmp_str); } string octstring(unsigned char val) { ostringstream buf; buf << oct << setw(3) << setfill('0') << static_cast(val); return buf.str(); } string unoctstring(string s) { int val; istringstream ss(s); ss >> oct >> val; DBG(cerr << "unoctstring: " << val << endl); char tmp_str[2]; tmp_str[0] = static_cast(val); tmp_str[1] = '\0'; return string(tmp_str); } /** Replace characters that are not allowed in DAP2 identifiers. -In the DAP itself, id2www() is called in: -# Array::print_decl() where dimension names are escaped -# AttrTable::print() (which calls AttrTable::simple_print()) where attribute names are escaped -# BaseType::print_decl() where variable names are escaped. -# Constructor::print_decl() where the name of the constructor type is printed. -# DDS::print() and DDS::print_constrained() where the name of the dataset is printed. 
-# Grid::print_decl() where the name of the grid is printed. -In the client code: -# id2www_ce() is called five times in the five methods that are used to request responses where a CE is appended to a URL (Connect::request_version, request_protocol, request_das, request_dds, request_data). @param in Replace characters in this string. @param allowable The set of characters that are allowed in a URI. default: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-+_/.\\*" @see id2www_ce() @return The modified identifier. */ string id2www(string in, const string &allowable) { string::size_type i = 0; DBG(cerr<<"Input string: [" << in << "]" << endl); while ((i = in.find_first_not_of(allowable, i)) != string::npos) { DBG(cerr<<"Found escapee: [" << in[i] << "]"); in.replace(i, 1, "%" + hexstring(in[i])); DBGN(cerr<<" now the string is: " << in << endl); i += 3;//i++; } return in; } /** Replace characters that are not allowed in WWW URLs using rules specific to Constraint Expressions. This has changed over time and now the only difference is that '*' is escaped by this function while it is not escaped by id2www(). @param in The string in which to replace characters. @param allowable The set of characters that are allowed in a URI. default: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-+_/.\\" @see id2www() @return The modified identifier. */ string id2www_ce(string in, const string &allowable) { return id2www(in, allowable); } /** Given a string that contains WWW escape sequences, translate those escape sequences back into the ASCII characters they represent. Return the modified string. -Places in the dap code where www2id() is called: -# Array::append_dim() the name is decoded before it is added -# AttrTable::set_name(), AttrTable::append_attr(), AttrTable::append_container(), AttrTable::del_attr(), AttrTable::add_container_alias(), AttrTable::add_value_alias() names are decoded before that are set/used. 
-# BaseType::set_name() Names are decoded before they are set -# When the constraint expression parser looks for a variable, the name is first decoded. -# DAS::DAS() Named attribute containers are decoded -# DDS::var() When a DDS searches for a variable, the name is first decoded. -# Grid::var(), Sequence::var(), Structure::var() Variable names are decoded. -In the server code: -# ResponseBuilder::initialize() The dataset name is decoded except that %20 is not removed. -# ResponseBuilder::set_ce() The CE is decoded, except for spaces (%20). -# ResponseBuilder::set_dataset_name() same logic as the first case. -# The ResponseBuilder methods supersede methods with the same names from DODSFilter, which is still in the code although deprecated. @param in The string to modify. @param escape The character used to signal the beginning of an escape sequence. default: "%" @param except If there are some escape codes that should not be removed by this call (e.g., you might not want to remove spaces, %20) use this parameter to specify those codes. The function will then transform all escapes \e except those given. For example, to suppress translation of both spaces and the ampersand, pass "%20%26" for 'except'. default: "" @return The modified string. */ string www2id(const string &in, const string &escape, const string &except) { string::size_type i = 0; string res = in; while ((i = res.find_first_of(escape, i)) != string::npos) { if (except.find(res.substr(i, 3)) != string::npos) { i += 3; continue; } res.replace(i, 3, unhexstring(res.substr(i + 1, 2))); ++i; } return res; } static string entity(char c) { switch (c) { case '>': return ">"; case '<': return "<"; case '&': return "&"; case '\'': return "'"; case '\"': return """; default: throw InternalErr(__FILE__, __LINE__, "Unrecognized character."); } } // Assumption: There are always exactly two octal digits in the input // and two hex digits in the result. 
string octal_to_hex(const string &octal_digits) { int val; istringstream ss(octal_digits); ss >> oct >> val; ostringstream ds; ds << hex << setw(2) << setfill('0') << val; return ds.str(); } /** Replace characters that are not allowed in XML @param in The string in which to replace characters. @param not_allowed The set of characters that are not allowed in XML. default: ><&'(single quote)"(double quote) @return The modified identifier. */ string id2xml(string in, const string ¬_allowed) { string::size_type i = 0; while ((i = in.find_first_of(not_allowed, i)) != string::npos) { in.replace(i, 1, entity(in[i])); ++i; } #if 0 // Removed the encoding of octal escapes. This function is used by // AttrTable to encode the stuff that is the value of the // element in the DDX. The problem is that some of the values are not // valid UTF-8 and that makes a XML parser gag.; ticket 1512. // jhrg 3/19/10 // OK, now scan for octal escape sequences like \\012 (where the '\' // is itself escaped). This type of attribute value comes from the netCDF // handler and maybe others. Assumption: The '\' will always appear as // in its escaped form: '\\'. NB: Both backslashes must be escaped in the // C++ string. string octal_escape = "\\\\"; i = 0; string::size_type length = in.length(); while ((i = in.find(octal_escape, i)) != string::npos) { // Get the three octal digits following the '\\0' string::size_type j = i + 2; if (j + 1 >= length) // Check that we're not past the end break; string octal_digits = in.substr(j, 3); // convert to a Ý XML escape string hex_escape = string("&#x"); hex_escape.append(octal_to_hex(octal_digits)); hex_escape.append(string(";")); // replace the octal escape with an XML/hex escape in.replace(i, 5, hex_escape); // increment i i += 6; } #endif return in; } /** Given a string that contains XML escape sequences (i.e., entities), translate those back into ASCII characters. Return the modified string. @param in The string to modify. @return The modified string. 
*/ string xml2id(string in) { string::size_type i = 0; while ((i = in.find(">", i)) != string::npos) in.replace(i, 4, ">"); i = 0; while ((i = in.find("<", i)) != string::npos) in.replace(i, 4, "<"); i = 0; while ((i = in.find("&", i)) != string::npos) in.replace(i, 5, "&"); i = 0; while ((i = in.find("'", i)) != string::npos) in.replace(i, 6, "'"); i = 0; while ((i = in.find(""", i)) != string::npos) in.replace(i, 6, "\""); return in; } /** Return a string that has all the \c %<hex digit><hex digit> sequences replaced with underscores (`_'). @param s The string to transform @return The modified string. */ string esc2underscore(string s) { string::size_type pos; while ((pos = s.find('%')) != string::npos) s.replace(pos, 3, "_"); return s; } /** Escape non-printable characters and quotes from an HDF attribute. @param s The attribute to modify. @return The modified attribute. */ string escattr(string s) { const string printable = " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789~`!@#$%^&*()_-+={[}]|\\:;<,>.?/'\""; const string ESC = "\\"; const string DOUBLE_ESC = ESC + ESC; const string QUOTE = "\""; const string ESCQUOTE = ESC + QUOTE; // escape \ with a second backslash string::size_type ind = 0; while ((ind = s.find(ESC, ind)) != s.npos) { s.replace(ind, 1, DOUBLE_ESC); ind += DOUBLE_ESC.length(); } // escape non-printing characters with octal escape ind = 0; while ((ind = s.find_first_not_of(printable, ind)) != s.npos) s.replace(ind, 1, ESC + octstring(s[ind])); // escape " with backslash ind = 0; while ((ind = s.find(QUOTE, ind)) != s.npos) { s.replace(ind, 1, ESCQUOTE); ind += ESCQUOTE.length(); } return s; } /** Un-escape special characters, quotes and backslashes from an HDF attribute. Note: A regex to match one \ must be defined as: Regex foo = "\\\\"; because both C++ strings and GNU's Regex also employ \ as an escape character! @param s The escaped attribute. @return The unescaped attribute. 
*/ string unescattr(string s) { Regex octal("\\\\[0-3][0-7][0-7]"); // matches 4 characters Regex esc_quote("\\\\\""); // matches 3 characters Regex esc_esc("\\\\\\\\"); // matches 2 characters const string ESC = "\\"; const string QUOTE = "\""; int matchlen; unsigned int index; DBG(cerr << "0XX" << s << "XXX" << endl); // unescape any escaped backslashes index = esc_esc.search(s.c_str(), s.length(), matchlen, 0); while (index < s.length()) { DBG(cerr << "1aXX" << s << "XXX index: " << index << endl); s.replace(index, 2, ESC); DBG(cerr << "1bXX" << s << "XXX index: " << index << endl); index = esc_esc.search(s.c_str(), s.length(), matchlen, 0); } // unescape any escaped double quote characters index = esc_quote.search(s.c_str(), s.length(), matchlen, 0); while (index < s.length()) { s.replace(index, 2, QUOTE); DBG(cerr << "2XX" << s << "XXX index: " << index << endl); index = esc_quote.search(s.c_str(), s.length(), matchlen, 0); } // unescape octal characters index = octal.search(s.c_str(), s.length(), matchlen, 0); while (index < s.length()) { s.replace(index, 4, unoctstring(s.substr(index + 1, 3))); DBG(cerr << "3XX" << s << "XXX index: " << index << endl); index = octal.search(s.c_str(), s.length(), matchlen, 0); } DBG(cerr << "4XX" << s << "XXX" << endl); return s; } string munge_error_message(string msg) { // First, add enclosing quotes if needed. if (*msg.begin() != '"') msg.insert(msg.begin(), '"'); if (*(msg.end() - 1) != '"') msg += "\""; // Now escape any internal double quotes that aren't escaped. string::iterator miter; for (miter = msg.begin() + 1; miter != msg.end() - 1; miter++) if (*miter == '"' && *(miter - 1) != '\\') miter = msg.insert(miter, '\\'); return msg; } /** Rip through a string and replace all the double quotes with \" sequences. 
@param source @return result */ string escape_double_quotes(string source) { string::size_type idx = 0; while((idx = source.find('\"', idx)) != string::npos) { source.replace(idx, 1, "\\\""); // a backslash and a double quote idx += 2; } return source; } /** Rip through a string and replace all the escaped double quotes with regular double quotes. @param source @return result */ string unescape_double_quotes(string source) { string::size_type idx = 0; while((idx = source.find("\\\"", idx)) != string::npos) { source.replace(idx, 2, "\""); // a backslash and a double quote ++idx; } return source; } } // namespace libdap