summaryrefslogtreecommitdiffstats
path: root/include/iprt/latin1.h
blob: 074c1e65f97346f87a91aa39e16f6b12beb3120a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
/** @file
 * IPRT - String Manipulation, Latin-1 (ISO-8859-1) encoding.
 */

/*
 * Copyright (C) 2006-2022 Oracle and/or its affiliates.
 *
 * This file is part of VirtualBox base platform packages, as
 * available from https://www.virtualbox.org.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, in version 3 of the
 * License.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <https://www.gnu.org/licenses>.
 *
 * The contents of this file may alternatively be used under the terms
 * of the Common Development and Distribution License Version 1.0
 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
 * in the VirtualBox distribution, in which case the provisions of the
 * CDDL are applicable instead of those of the GPL.
 *
 * You may elect to license modified versions of this file under the
 * terms and conditions of either the GPL or the CDDL or both.
 *
 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
 */

#ifndef IPRT_INCLUDED_latin1_h
#define IPRT_INCLUDED_latin1_h
#ifndef RT_WITHOUT_PRAGMA_ONCE
# pragma once
#endif

#include <iprt/assert.h>
#include <iprt/errcore.h> /* VERR_END_OF_STRING */

RT_C_DECLS_BEGIN


/** @defgroup rt_str_latin1     Latin-1 (ISO-8859-1) String Manipulation
 * @ingroup grp_rt_str
 *
 * Deals with Latin-1 encoded strings.
 *
 * @warning Make sure to name all variables dealing with Latin-1 strings
 *          suchthat there is no way to mistake them for normal UTF-8 strings.
 *          There may be severe security issues resulting from mistaking Latin-1
 *          for UTF-8!
 *
 * @{
 */

/**
 * Get the unicode code point at the given string position.
 *
 * @returns unicode code point.
 * @returns RTUNICP_INVALID if the encoding is invalid.
 * @param   pszLatin1   The Latin-1 string.
 */
DECLINLINE(RTUNICP) RTLatin1GetCp(const char *pszLatin1)
{
    return *(const unsigned char *)pszLatin1;
}

/**
 * Get the unicode code point at the given string position.
 *
 * @returns iprt status code.
 * @param   ppszLatin1  Pointer to the string pointer. This will be updated to
 *                      point to the char following the current code point. This
 *                      is advanced one character forward on failure.
 * @param   pCp         Where to store the code point. RTUNICP_INVALID is stored
 *                      here on failure.
 */
DECLINLINE(int) RTLatin1GetCpEx(const char **ppszLatin1, PRTUNICP pCp)
{
    const unsigned char uch = **(const unsigned char **)ppszLatin1;
    (*ppszLatin1)++;
    *pCp = uch;
    return VINF_SUCCESS;
}

/**
 * Get the unicode code point at the given string position for a string of a
 * given maximum length.
 *
 * @returns iprt status code.
 * @retval  VERR_END_OF_STRING if *pcch is 0. *pCp is set to RTUNICP_INVALID.
 *
 * @param   ppszLatin1  Pointer to the string pointer. This will be updated to
 *                      point to the char following the current code point.
 * @param   pcchLatin1  Pointer to the maximum string length.  This will be
 *                      decremented by the size of the code point found.
 * @param   pCp         Where to store the code point.
 *                      RTUNICP_INVALID is stored here on failure.
 */
DECLINLINE(int) RTLatin1GetCpNEx(const char **ppszLatin1, size_t *pcchLatin1, PRTUNICP pCp)
{
    if (RT_LIKELY(*pcchLatin1 != 0))
    {
        const unsigned char uch = **(const unsigned char **)ppszLatin1;
        (*ppszLatin1)++;
        (*pcchLatin1)--;
        *pCp = uch;
        return VINF_SUCCESS;
    }
    *pCp = RTUNICP_INVALID;
    return VERR_END_OF_STRING;
}

/**
 * Get the Latin-1 size in characters of a given Unicode code point.
 *
 * The code point is expected to be a valid Unicode one, but not necessarily in
 * the range supported by Latin-1.
 *
 * @returns the size in characters, or zero if there is no Latin-1 encoding
 */
DECLINLINE(size_t) RTLatin1CpSize(RTUNICP CodePoint)
{
    if (CodePoint < 0x100)
        return 1;
    return 0;
}

/**
 * Put the unicode code point at the given string position
 * and return the pointer to the char following it.
 *
 * This function will not consider anything at or following the
 * buffer area pointed to by psz. It is therefore not suitable for
 * inserting code points into a string, only appending/overwriting.
 *
 * @returns pointer to the char following the written code point.
 * @param   pszLatin1   The string.
 * @param   CodePoint   The code point to write.
 *                      This should not be RTUNICP_INVALID or any other
 *                      character out of the Latin-1 range.
 */
DECLINLINE(char *) RTLatin1PutCp(char *pszLatin1, RTUNICP CodePoint)
{
    AssertReturn(CodePoint < 0x100, NULL);
    *pszLatin1++ = (unsigned char)CodePoint;
    return pszLatin1;
}

/**
 * Skips ahead, past the current code point.
 *
 * @returns Pointer to the char after the current code point.
 * @param   pszLatin1   Pointer to the current code point.
 * @remark  This will not move the next valid code point, only past the current one.
 */
DECLINLINE(char *) RTLatin1NextCp(const char *pszLatin1)
{
    pszLatin1++;
    return (char *)pszLatin1;
}

/**
 * Skips back to the previous code point.
 *
 * @returns Pointer to the char before the current code point.
 * @returns pszLatin1Start on failure.
 * @param   pszLatin1Start  Pointer to the start of the string.
 * @param   pszLatin1       Pointer to the current code point.
 */
DECLINLINE(char *) RTLatin1PrevCp(const char *pszLatin1Start, const char *pszLatin1)
{
    if ((uintptr_t)pszLatin1 > (uintptr_t)pszLatin1Start)
    {
        pszLatin1--;
        return (char *)pszLatin1;
    }
    return (char *)pszLatin1Start;
}

/**
 * Translate a Latin1 string into a UTF-8 allocating the result buffer (default
 * tag).
 *
 * @returns iprt status code.
 * @param   pszLatin1       Latin1 string to convert.
 * @param   ppszString      Receives pointer of allocated UTF-8 string on
 *                          success, and is always set to NULL on failure.
 *                          The returned pointer must be freed using RTStrFree().
 */
#define RTLatin1ToUtf8(pszLatin1, ppszString)       RTLatin1ToUtf8Tag((pszLatin1), (ppszString), RTSTR_TAG)

/**
 * Translate a Latin-1 string into a UTF-8 allocating the result buffer.
 *
 * @returns iprt status code.
 * @param   pszLatin1       Latin-1 string to convert.
 * @param   ppszString      Receives pointer of allocated UTF-8 string on
 *                          success, and is always set to NULL on failure.
 *                          The returned pointer must be freed using RTStrFree().
 * @param   pszTag          Allocation tag used for statistics and such.
 */
RTDECL(int)  RTLatin1ToUtf8Tag(const char *pszLatin1, char **ppszString, const char *pszTag);

/**
 * Translates Latin-1 to UTF-8 using buffer provided by the caller or a fittingly
 * sized buffer allocated by the function (default tag).
 *
 * @returns iprt status code.
 * @param   pszLatin1       The Latin-1 string to convert.
 * @param   cchLatin1       The number of Latin-1 characters to translate from
 *                          pszLatin1. The translation will stop when reaching
 *                          cchLatin1 or the terminator ('\\0').  Use RTSTR_MAX
 *                          to translate the entire string.
 * @param   ppsz            If @a cch is non-zero, this must either be pointing
 *                          to a pointer to a buffer of the specified size, or
 *                          pointer to a NULL pointer.  If *ppsz is NULL or
 *                          @a cch is zero a buffer of at least @a cch chars
 *                          will be allocated to hold the translated string. If
 *                          a buffer was requested it must be freed using
 *                          RTStrFree().
 * @param   cch             The buffer size in chars (the type). This includes the terminator.
 * @param   pcch            Where to store the length of the translated string,
 *                          excluding the terminator. (Optional)
 *
 *                          This may be set under some error conditions,
 *                          however, only for VERR_BUFFER_OVERFLOW and
 *                          VERR_NO_STR_MEMORY will it contain a valid string
 *                          length that can be used to resize the buffer.
 */
#define RTLatin1ToUtf8Ex(pszLatin1, cchLatin1, ppsz, cch, pcch) \
    RTLatin1ToUtf8ExTag((pszLatin1), (cchLatin1), (ppsz), (cch), (pcch), RTSTR_TAG)

/**
 * Translates Latin1 to UTF-8 using buffer provided by the caller or a fittingly
 * sized buffer allocated by the function (custom tag).
 *
 * @returns iprt status code.
 * @param   pszLatin1       The Latin1 string to convert.
 * @param   cchLatin1       The number of Latin1 characters to translate from
 *                          pwszString.  The translation will stop when
 *                          reaching cchLatin1 or the terminator ('\\0').  Use
 *                          RTSTR_MAX to translate the entire string.
 * @param   ppsz            If cch is non-zero, this must either be pointing to
 *                          a pointer to a buffer of the specified size, or
 *                          pointer to a NULL pointer.  If *ppsz is NULL or cch
 *                          is zero a buffer of at least cch chars will be
 *                          allocated to hold the translated string.  If a
 *                          buffer was requested it must be freed using
 *                          RTStrFree().
 * @param   cch             The buffer size in chars (the type).  This includes
 *                          the terminator.
 * @param   pcch            Where to store the length of the translated string,
 *                          excluding the terminator. (Optional)
 *
 *                          This may be set under some error conditions,
 *                          however, only for VERR_BUFFER_OVERFLOW and
 *                          VERR_NO_STR_MEMORY will it contain a valid string
 *                          length that can be used to resize the buffer.
 * @param   pszTag          Allocation tag used for statistics and such.
 */
RTDECL(int)  RTLatin1ToUtf8ExTag(const char *pszLatin1, size_t cchLatin1, char **ppsz, size_t cch, size_t *pcch,
                                 const char *pszTag);

/**
 * Calculates the length of the Latin-1 string in UTF-8 chars (bytes).
 *
 * The primary purpose of this function is to help allocate buffers for
 * RTLatin1ToUtf8() of the correct size. For most other purposes
 * RTLatin1ToUtf8Ex() should be used.
 *
 * @returns Number of chars (bytes).
 * @returns 0 if the string was incorrectly encoded.
 * @param   pszLatin1     The Latin-1 string.
 */
RTDECL(size_t) RTLatin1CalcUtf8Len(const char *pszLatin1);

/**
 * Calculates the length of the Latin-1 string in UTF-8 chars (bytes).
 *
 * @returns iprt status code.
 * @param   pszLatin1   The Latin-1 string.
 * @param   cchLatin1   The max string length. Use RTSTR_MAX to process the
 *                      entire string.
 * @param   pcch        Where to store the string length (in bytes).  Optional.
 *                      This is undefined on failure.
 */
RTDECL(int) RTLatin1CalcUtf8LenEx(const char *pszLatin1, size_t cchLatin1, size_t *pcch);

/**
 * Calculates the length of the Latin-1 (ISO-8859-1) string in RTUTF16 items.
 *
 * @returns Number of RTUTF16 items.
 * @param   pszLatin1       The Latin-1 string.
 */
RTDECL(size_t) RTLatin1CalcUtf16Len(const char *pszLatin1);

/**
 * Calculates the length of the Latin-1 (ISO-8859-1) string in RTUTF16 items.
 *
 * @returns iprt status code.
 * @param   pszLatin1       The Latin-1 string.
 * @param   cchLatin1       The max string length. Use RTSTR_MAX to process the
 *                          entire string.
 * @param   pcwc            Where to store the string length. Optional.
 *                          This is undefined on failure.
 */
RTDECL(int) RTLatin1CalcUtf16LenEx(const char *pszLatin1, size_t cchLatin1, size_t *pcwc);

/**
 * Translate a Latin-1 (ISO-8859-1) string into a UTF-16 allocating the result
 * buffer (default tag).
 *
 * @returns iprt status code.
 * @param   pszLatin1       The Latin-1 string to convert.
 * @param   ppwszString     Receives pointer to the allocated UTF-16 string. The
 *                          returned string must be freed using RTUtf16Free().
 */
#define RTLatin1ToUtf16(pszLatin1, ppwszString)     RTLatin1ToUtf16Tag((pszLatin1), (ppwszString), RTSTR_TAG)

/**
 * Translate a Latin-1 (ISO-8859-1) string into a UTF-16 allocating the result
 * buffer (custom tag).
 *
 * @returns iprt status code.
 * @param   pszLatin1       The Latin-1 string to convert.
 * @param   ppwszString     Receives pointer to the allocated UTF-16 string. The
 *                          returned string must be freed using RTUtf16Free().
 * @param   pszTag          Allocation tag used for statistics and such.
 */
RTDECL(int) RTLatin1ToUtf16Tag(const char *pszLatin1, PRTUTF16 *ppwszString, const char *pszTag);

/**
 * Translates pszLatin1 from Latin-1 (ISO-8859-1) to UTF-16, allocating the
 * result buffer if requested (default tag).
 *
 * @returns iprt status code.
 * @param   pszLatin1       The Latin-1 string to convert.
 * @param   cchLatin1       The maximum size in chars (the type) to convert. The
 *                          conversion stops when it reaches cchLatin1 or the
 *                          string terminator ('\\0'). Use RTSTR_MAX to
 *                          translate the entire string.
 * @param   ppwsz           If cwc is non-zero, this must either be pointing
 *                          to pointer to a buffer of the specified size, or
 *                          pointer to a NULL pointer.
 *                          If *ppwsz is NULL or cwc is zero a buffer of at
 *                          least cwc items will be allocated to hold the
 *                          translated string. If a buffer was requested it
 *                          must be freed using RTUtf16Free().
 * @param   cwc             The buffer size in RTUTF16s. This includes the
 *                          terminator.
 * @param   pcwc            Where to store the length of the translated string,
 *                          excluding the terminator. (Optional)
 *
 *                          This may be set under some error conditions,
 *                          however, only for VERR_BUFFER_OVERFLOW and
 *                          VERR_NO_STR_MEMORY will it contain a valid string
 *                          length that can be used to resize the buffer.
 */
#define RTLatin1ToUtf16Ex(pszLatin1, cchLatin1, ppwsz, cwc, pcwc) \
    RTLatin1ToUtf16ExTag((pszLatin1), (cchLatin1), (ppwsz), (cwc), (pcwc), RTSTR_TAG)

/**
 * Translates pszLatin1 from Latin-1 (ISO-8859-1) to UTF-16, allocating the
 * result buffer if requested.
 *
 * @returns iprt status code.
 * @param   pszLatin1       The Latin-1 string to convert.
 * @param   cchLatin1       The maximum size in chars (the type) to convert. The
 *                          conversion stops when it reaches cchLatin1 or the
 *                          string terminator ('\\0'). Use RTSTR_MAX to
 *                          translate the entire string.
 * @param   ppwsz           If cwc is non-zero, this must either be pointing
 *                          to pointer to a buffer of the specified size, or
 *                          pointer to a NULL pointer.
 *                          If *ppwsz is NULL or cwc is zero a buffer of at
 *                          least cwc items will be allocated to hold the
 *                          translated string. If a buffer was requested it
 *                          must be freed using RTUtf16Free().
 * @param   cwc             The buffer size in RTUTF16s. This includes the
 *                          terminator.
 * @param   pcwc            Where to store the length of the translated string,
 *                          excluding the terminator. (Optional)
 *
 *                          This may be set under some error conditions,
 *                          however, only for VERR_BUFFER_OVERFLOW and
 *                          VERR_NO_STR_MEMORY will it contain a valid string
 *                          length that can be used to resize the buffer.
 * @param   pszTag          Allocation tag used for statistics and such.
 */
RTDECL(int) RTLatin1ToUtf16ExTag(const char *pszLatin1, size_t cchLatin1,
                                 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag);

/** @} */

RT_C_DECLS_END

#endif /* !IPRT_INCLUDED_latin1_h */