summaryrefslogtreecommitdiffstats
path: root/src/lib/util/strutil.h
blob: e5d2496a80e96437381f9e5f570cec373e0120fb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
// Copyright (C) 2011-2022 Internet Systems Consortium, Inc. ("ISC")
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef STRUTIL_H
#define STRUTIL_H

#include <algorithm>
#include <cctype>
#include <stdint.h>
#include <string>
#include <iomanip>
#include <sstream>
#include <vector>
#include <exceptions/exceptions.h>
#include <boost/lexical_cast.hpp>
#include <boost/shared_ptr.hpp>

namespace isc {
namespace util {
namespace str {

/// @brief A Set of C++ Utilities for Manipulating Strings

/// @brief Exception raised by the string utilities when @c getToken() or
/// @c tokenToNum() is handed input data it cannot process.
class StringTokenError : public Exception {
public:
    /// @brief Constructor.
    ///
    /// All arguments are forwarded unchanged to the @c isc::Exception base.
    ///
    /// @param file Name of the source file reporting the error.
    /// @param line Line number within that file.
    /// @param what Text describing the error.
    StringTokenError(const char* file, size_t line, const char* what)
        : isc::Exception(file, line, what)
    {
    }
};

/// @brief Normalize Backslash
///
/// Only relevant to Windows, this replaces all "\" in a string with "/".
/// The string is passed by non-const reference and updated in place; the
/// function itself returns nothing.  On other systems it is a no-op.  Note
/// that Windows does recognize file names with the "\" replaced by "/"
/// (at least in system calls, if not the command line).
///
/// @param name Name in which the substitution is made (modified in place)
void normalizeSlash(std::string& name);

/// @brief Trim Leading and Trailing Spaces
///
/// Returns a copy of the input string but with any leading or trailing spaces
/// or tabs removed.
///
/// @param instring Input string to trim (taken by const reference, so the
/// caller's string is left untouched)
///
/// @return Copy of the input string with leading and trailing spaces removed
std::string trim(const std::string& instring);

/// @brief Finds the "trimmed" end of a buffer
///
/// Works backward from the end of the buffer, skipping every trailing
/// element that is equal to the trim value, and returns an iterator
/// marking the new (trimmed) end of the buffer.
///
/// The iterators only need to support decrement (operator--), so both
/// random access iterators (vectors, strings, raw pointers) and
/// bidirectional iterators (e.g. std::list) may be used.
///
/// @tparam Iterator Bidirectional or random access iterator type
/// @param begin - Iterator pointing to the beginning of the
/// buffer to trim
/// @param end - Iterator pointing to the untrimmed end of
/// the buffer to trim
/// @param trim_val - byte value to trim off
///
/// @return Iterator pointing just past the last element that is not
/// equal to the trim value.  It equals @c begin when the buffer is empty
/// or consists solely of trim values, and @c end when nothing is trimmed.
template<typename Iterator>
Iterator
seekTrimmed(Iterator begin, Iterator end, uint8_t trim_val) {
    while (end != begin) {
        // Step back with operator-- rather than "end - 1" so that
        // iterators without random access are supported as well.
        Iterator last = end;
        --last;
        if (!(*last == trim_val)) {
            break;
        }
        end = last;
    }
    return (end);
}

/// @brief Split String into Tokens
///
/// Splits a string into tokens (the tokens being delimited by one or more of
/// the delimiter characters) and returns the tokens in a vector array. Note
/// that adjacent delimiters are considered to be a single delimiter.
///
/// Special cases are:
/// -# The empty string is considered to be zero tokens.
/// -# A string comprising nothing but delimiters is considered to be zero
///    tokens.
///
/// The reasoning behind this is that the string can be thought of as having
/// invisible leading and trailing delimiter characters.  Therefore both cases
/// reduce to a set of contiguous delimiters, which are considered a single
/// delimiter (so getting rid of the string).
///
/// The optional escape mode allows delimiter characters (and *only* them
/// and the escape character itself) to be escaped using a backslash.
///
/// We could use Boost for this, but this (simple) function eliminates one
/// dependency in the code.
///
/// @param text String to be split.  It is taken by const reference, so the
/// caller's string is not modified.
/// @param delim Delimiter characters (defaults to space, tab and newline)
/// @param escape Use backslash to escape delimiter characters (defaults to
/// false)
///
/// @return Vector of tokens.
std::vector<std::string> tokens(const std::string& text,
        const std::string& delim = std::string(" \t\n"),
        bool escape = false);

/// @brief Uppercase Character
///
/// Used in uppercase() to pass as an argument to std::transform().  The
/// function std::toupper() can't be used as it takes an "int" as its argument;
/// this confuses the template expansion mechanism because dereferencing a
/// string::iterator returns a char.
///
/// @param chr Character to be upper-cased.
///
/// @return Uppercase version of the argument
inline char toUpper(char chr) {
    // std::toupper() requires its argument to be representable as an
    // unsigned char (or be EOF).  Plain char may be signed, so a character
    // with the high bit set would be passed as a negative value, which is
    // undefined behaviour.  Go through unsigned char first.
    return (static_cast<char>(std::toupper(static_cast<unsigned char>(chr))));
}

/// @brief Uppercase String
///
/// Convenience wrapper that upper-cases a string in place, converting each
/// character with toUpper().
///
/// @param text String to be upper-cased (modified in place).
inline void uppercase(std::string& text) {
    for (char& ch : text) {
        ch = isc::util::str::toUpper(ch);
    }
}

/// @brief Lowercase Character
///
/// Used in lowercase() to pass as an argument to std::transform().  The
/// function std::tolower() can't be used as it takes an "int" as its argument;
/// this confuses the template expansion mechanism because dereferencing a
/// string::iterator returns a char.
///
/// @param chr Character to be lower-cased.
///
/// @return Lowercase version of the argument
inline char toLower(char chr) {
    // std::tolower() requires its argument to be representable as an
    // unsigned char (or be EOF).  Plain char may be signed, so a character
    // with the high bit set would be passed as a negative value, which is
    // undefined behaviour.  Go through unsigned char first.
    return (static_cast<char>(std::tolower(static_cast<unsigned char>(chr))));
}

/// @brief Lowercase String
///
/// Convenience wrapper that lower-cases a string in place, converting each
/// character with toLower().
///
/// @param text String to be lower-cased (modified in place).
inline void lowercase(std::string& text) {
    for (char& ch : text) {
        ch = isc::util::str::toLower(ch);
    }
}

/// @brief Apply Formatting
///
/// Given a printf-style format string containing only "%s" placeholders
/// (others are ignored) and a vector of strings, this produces a single string
/// with the placeholders replaced.
///
/// @param format Format string
/// @param args Vector of argument strings used to replace the placeholders
///
/// @return Resultant string
std::string format(const std::string& format,
    const std::vector<std::string>& args);


/// @brief Returns one token from the given stringstream
///
/// The token is read using the >> operator, with basic error checking.
///
/// @param iss Input string stream to read one token from
///
/// @return the first token read from the stringstream
///
/// @throw StringTokenError if the token cannot be read from the stream
std::string getToken(std::istringstream& iss);

/// @brief Converts a string token to an *unsigned* integer.
///
/// The token is converted with a lexical cast and the result is then
/// checked against the permitted range.
///
/// NumType is a *signed* integral type (e.g. int32_t) that is sufficiently
/// wide to store resulting integers, i.e. wide enough for the value
/// 2^BitSize to be computed without overflow.
///
/// BitSize is the maximum number of bits that the resulting integer can take.
/// The function first checks whether the given token can be converted to
/// an integer of NumType type.  It then confirms that the converted value
/// lies within the valid range, i.e., [0, 2^BitSize - 1].  The second check
/// is necessary because lexical_cast<T> where T is an unsigned integer type
/// doesn't correctly reject negative numbers when compiled with SunStudio.
///
/// @tparam NumType signed integral type used to hold the result
/// @tparam BitSize maximum number of bits the result may occupy
///
/// @param num_token the string token to convert
///
/// @return the converted value, of type NumType
///
/// @throw StringTokenError if the value is out of range, or if it
///        could not be converted
template <typename NumType, int BitSize>
NumType
tokenToNum(const std::string& num_token) {
    NumType value;
    try {
        value = boost::lexical_cast<NumType>(num_token);
    } catch (const boost::bad_lexical_cast&) {
        isc_throw(StringTokenError, "Invalid SRV numeric parameter: " <<
                  num_token);
    }

    // First value that no longer fits into BitSize bits.  The type is
    // deliberately left to the compiler so that the usual integer promotion
    // of the shift expression is retained.
    const auto limit = static_cast<NumType>(1) << BitSize;
    const bool in_range = !(value < 0) && (value < limit);
    if (!in_range) {
        isc_throw(StringTokenError, "Numeric SRV parameter out of range: " <<
                  value);
    }
    return (value);
}

/// @brief Converts a string in quotes into vector.
///
/// A converted string is first trimmed. If a trimmed string is in
/// quotes, the quotes are removed and the resulting string is copied
/// into a vector, which is returned. If the string is not in quotes,
/// an empty vector is returned.
///
/// This function is intended to be used by the server configuration
/// parsers to convert string values surrounded with quotes into
/// binary form.
///
/// @param quoted_string String to be converted.
///
/// @return Vector containing the converted string, or an empty vector if
/// the input string didn't contain the expected quote characters.
std::vector<uint8_t>
quotedStringToBinary(const std::string& quoted_string);

/// @brief Converts a string of separated hexadecimal digits
/// into a vector.
///
/// Octets may contain 1 or 2 digits. For example, using a colon
/// for a separator all of the following are valid:
///
/// - yy:yy:yy:yy:yy
/// - y:y:y:y:y
/// - y:yy:yy:y:y
///
/// If the decoded string doesn't match any of the supported formats,
/// an exception is thrown.
///
/// @param hex_string Input string.
/// @param sep Separator placed between the octets; it is passed as a
/// string (e.g. ":").
/// @param binary Vector receiving the input string converted into binary.
///
/// @throw isc::BadValue if the format of the input string is invalid.
void
decodeSeparatedHexString(const std::string& hex_string,
                         const std::string& sep,
                         std::vector<uint8_t>& binary);

/// @brief Converts a string of hexadecimal digits with colons into
///  a vector.
///
/// Convenience method which calls @c decodeSeparatedHexString() passing
/// in a colon for the separator.
///
/// @param hex_string Input string.
/// @param binary Vector receiving the input string converted into binary.
///
/// @throw isc::BadValue if the format of the input string is invalid.
void
decodeColonSeparatedHexString(const std::string& hex_string,
                              std::vector<uint8_t>& binary);

/// @brief Converts a formatted string of hexadecimal digits into
/// a vector.
///
/// This function supports the following formats:
///
/// - yy:yy:yy:yy or yy yy yy yy - octets delimited by colons or
/// spaces, see @c decodeSeparatedHexString
///
/// - yyyyyyyyyy
/// - 0xyyyyyyyyyy
///
/// If there is an odd number of hexadecimal digits in the input
/// string, a '0' is prepended to the string before decoding.
///
/// @param hex_string Input string.
/// @param binary Vector receiving the input string converted into binary.
///
/// @throw isc::BadValue if the format of the input string is invalid.
void
decodeFormattedHexString(const std::string& hex_string,
                         std::vector<uint8_t>& binary);

/// @brief Forward declaration of the class implementing @c StringSanitizer.
class StringSanitizerImpl;

/// @brief Shared pointer to a @c StringSanitizerImpl.
using StringSanitizerImplPtr = boost::shared_ptr<StringSanitizerImpl>;

/// @brief Implements a regular expression based string scrubber
///
/// The implementation uses C++11 regex IF the environment supports it
/// (tested in configure.ac). If not it falls back to C lib regcomp/regexec.
/// Older compilers, such as pre Gnu g++ 4.9.0, provided only experimental
/// implementations of regex which are recognized as buggy.
///
/// The actual work is delegated to a @c StringSanitizerImpl instance held
/// through @c impl_.
class StringSanitizer {
public:

    /// @brief Constructor.
    ///
    /// Compiles the given character set into a regular expression, and
    /// retains the given character replacement. Thereafter, the instance
    /// may be used to scrub an arbitrary number of strings.
    ///
    /// @param char_set string containing a regular expression (POSIX
    /// extended syntax) that describes the characters to replace.  If you
    /// wanted to sanitize hostnames for example, you could specify the
    /// inversion of valid characters "[^A-Za-z0-9_-]".
    /// @param char_replacement string of one or more characters to use as the
    /// replacement for invalid characters.
    ///
    /// @throw BadValue if given an invalid regular expression
    StringSanitizer(const std::string& char_set,
                    const std::string& char_replacement);

    /// @brief Destructor.
    ///
    /// Destroys the implementation instance.
    ~StringSanitizer();

    /// @brief Returns a scrubbed copy of a given string
    ///
    /// Replaces all occurrences of characters described by the regular
    /// expression with the character replacement.
    ///
    /// @param original the string to scrub
    ///
    /// @return Copy of the original string with the matching characters
    /// replaced
    ///
    /// @throw Unexpected if an error occurs during scrubbing
    std::string scrub(const std::string& original);

    /// @brief The maximum size for regex parameters.
    ///
    /// @note The regex engine is implemented using recursion and can cause
    /// stack overflow if the input data is too large. An arbitrary size of
    /// 4096 should be enough for all cases.
    static const uint32_t MAX_DATA_SIZE;

private:
    /// @brief Pointer to the @c StringSanitizerImpl.
    StringSanitizerImplPtr impl_;
};

/// @brief Shared pointer to a @c StringSanitizer.
using StringSanitizerPtr = boost::shared_ptr<StringSanitizer>;

/// @brief Check if a string is printable
///
/// Each character is tested with std::isprint(), so the result depends on
/// the current C locale.
///
/// @param content String to check for printable characters
///
/// @return True if empty or contains only printable characters, False otherwise
inline bool
isPrintable(const std::string& content) {
    for (const auto& ch : content) {
        // std::isprint() requires a value representable as unsigned char
        // (or EOF).  Plain char may be signed, so characters with the high
        // bit set must be converted through unsigned char to avoid passing
        // a negative value, which is undefined behaviour.
        if (std::isprint(static_cast<unsigned char>(ch)) == 0) {
            return (false);
        }
    }
    return (true);
}

/// @brief Check if a byte vector is printable
///
/// Every byte of the vector is tested with isprint().
///
/// @param content Vector to check for printable characters
///
/// @return True if empty or contains only printable characters, False otherwise
inline bool
isPrintable(const std::vector<uint8_t>& content) {
    return (std::all_of(content.begin(), content.end(),
                        [](const uint8_t& octet) {
                            return (isprint(static_cast<int>(octet)) != 0);
                        }));
}


/// @brief Dumps a buffer of bytes as a string of hexadecimal digits
///
/// @param data pointer to the data to dump
/// @param length number of bytes to dump. Caller should ensure the length
/// does not exceed the buffer.
///
/// @return String of hexadecimal digits representing the dumped bytes
std::string dumpAsHex(const uint8_t* data, size_t length);

} // namespace str
} // namespace util
} // namespace isc

#endif // STRUTIL_H