1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
|
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
* UTF-8-related functionality, including a type-safe structure representing a
* UTF-8 code unit.
*/
#ifndef mozilla_Utf8_h
#define mozilla_Utf8_h
#include "mozilla/Casting.h" // for mozilla::AssertedCast
#include "mozilla/Likely.h" // for MOZ_UNLIKELY
#include "mozilla/Maybe.h" // for mozilla::Maybe
#include "mozilla/Span.h" // for mozilla::Span
#include "mozilla/TextUtils.h" // for mozilla::IsAscii and via Latin1.h for
// encoding_rs_mem.h and MOZ_HAS_JSRUST.
#include "mozilla/Tuple.h" // for mozilla::Tuple
#include "mozilla/Types.h" // for MFBT_API
#include <limits> // for CHAR_BIT / std::numeric_limits
#include <stddef.h> // for size_t
#include <stdint.h> // for uint8_t
#if MOZ_HAS_JSRUST()
// Can't include mozilla/Encoding.h here.
extern "C" {
// Declared as uint8_t instead of char to match declaration in another header.
size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len);
}
#else
namespace mozilla {
namespace detail {
extern MFBT_API bool IsValidUtf8(const void* aCodeUnits, size_t aCount);
}; // namespace detail
}; // namespace mozilla
#endif // MOZ_HAS_JSRUST
namespace mozilla {
union Utf8Unit;
static_assert(CHAR_BIT == 8,
"Utf8Unit won't work so well with non-octet chars");
/**
* A code unit within a UTF-8 encoded string. (A code unit is the smallest
* unit within the Unicode encoding of a string. For UTF-8 this is an 8-bit
* number; for UTF-16 it would be a 16-bit number.)
*
* This is *not* the same as a single code point: in UTF-8, non-ASCII code
* points are constituted by multiple code units.
*/
union Utf8Unit {
private:
// Utf8Unit is a union wrapping a raw |char|. The C++ object model and C++
// requirements as to how objects may be accessed with respect to their actual
// types (almost?) uniquely compel this choice.
//
// Our requirements for a UTF-8 code unit representation are:
//
// 1. It must be "compatible" with C++ character/string literals that use
// the UTF-8 encoding. Given a properly encoded C++ literal, you should
// be able to use |Utf8Unit| and friends to access it; given |Utf8Unit|
// and friends (particularly UnicodeData), you should be able to access
// C++ character types for their contents.
// 2. |Utf8Unit| and friends must convert to/from |char| and |char*| only by
// explicit operation.
// 3. |Utf8Unit| must participate in overload resolution and template type
// equivalence (that is, given |template<class> class X|, when |X<T>| and
// |X<U>| are the same type) distinctly from the C++ character types.
//
// And a few nice-to-haves (at least for the moment):
//
// 4. The representation should use unsigned numbers, to avoid undefined
// behavior that can arise with signed types, and because Unicode code
// points and code units are unsigned.
// 5. |Utf8Unit| and friends should be convertible to/from |unsigned char|
// and |unsigned char*|, for APIs that (because of #4 above) use those
// types as the "natural" choice for UTF-8 data.
//
// #1 requires that |Utf8Unit| "incorporate" a C++ character type: one of
// |{,{un,}signed} char|.[0] |uint8_t| won't work because it might not be a
// C++ character type.
//
// #2 and #3 mean that |Utf8Unit| can't *be* such a type (or a typedef to one:
// typedefs don't generate *new* types, just type aliases). This requires a
// compound type.
//
// The ultimate representation (and character type in it) is constrained by
// C++14 [basic.lval]p10 that defines how objects may be accessed, with
// respect to the dynamic type in memory and the actual type used to access
// them. It reads:
//
// If a program attempts to access the stored value of an object
// through a glvalue of other than one of the following types the
// behavior is undefined:
//
// 1. the dynamic type of the object,
// 2. a cv-qualified version of the dynamic type of the object,
// ...other types irrelevant here...
// 3. an aggregate or union type that includes one of the
// aforementioned types among its elements or non-static data
// members (including, recursively, an element or non-static
// data member of a subaggregate or contained union),
// ...more irrelevant types...
// 4. a char or unsigned char type.
//
// Accessing (wrapped) UTF-8 data as |char|/|unsigned char| is allowed no
// matter the representation by #4. (Briefly set aside what values are seen.)
// (And #2 allows |const| on either the dynamic type or the accessing type.)
// (|signed char| is really only useful for small signed numbers, not
// characters, so we ignore it.)
//
// If we interpret contents as |char|/|unsigned char| contrary to the actual
// type stored there, what happens? C++14 [basic.fundamental]p1 requires
// character types be identically aligned/sized; C++14 [basic.fundamental]p3
// requires |signed char| and |unsigned char| have the same value
// representation. C++ doesn't require identical bitwise representation, tho.
// Practically we could assume it, but this verges on C++ spec bits best not
// *relied* on for correctness, if possible.
//
// So we don't expose |Utf8Unit|'s contents as |unsigned char*|: only |char|
// and |char*|. Instead we safely expose |unsigned char| by fully-defined
// *integral conversion* (C++14 [conv.integral]p2). Integral conversion from
// |unsigned char| → |char| has only implementation-defined behavior. It'd be
// better not to depend on that, but given twos-complement won, it should be
// okay. (Also |unsigned char*| is awkward enough to work with for strings
// that it probably doesn't appear in string manipulation much anyway, only in
// places that should really use |Utf8Unit| directly.)
//
// The opposite direction -- interpreting |char| or |char*| data through
// |Utf8Unit| -- isn't tricky as long as |Utf8Unit| contains a |char| as
// decided above, using #3. An "aggregate or union" will work that contains a
// |char|. Oddly, an aggregate won't work: C++14 [dcl.init.aggr]p1 says
// aggregates must have "no private or protected non-static data members", and
// we want to keep the inner |char| hidden. So a |struct| is out, and only
// |union| remains.
//
// (Enums are not "an aggregate or union type", so [maybe surprisingly] we
// can't make |Utf8Unit| an enum class with |char| underlying type, because we
// are given no license to treat |char| memory as such an |enum|'s memory.)
//
// Therefore |Utf8Unit| is a union type with a |char| non-static data member.
// This satisfies all our requirements. It also supports the nice-to-haves of
// creating a |Utf8Unit| from an |unsigned char|, and being convertible to
// |unsigned char|. It doesn't satisfy the nice-to-haves of using an
// |unsigned char| internally, nor of letting us wrap an existing
// |unsigned char| or pointer to one. We probably *could* do these, if we
// were willing to rely harder on implementation-defined behaviors, but for
// now we privilege C++'s main character type over some conceptual purity.
//
// 0. There's a proposal for a UTF-8 character type distinct from the existing
// C++ narrow character types:
//
// http://open-std.org/JTC1/SC22/WG21/docs/papers/2016/p0482r0.html
//
// but it hasn't been standardized (and might never be), and none of the
// compilers we really care about have implemented it. Maybe someday we
// can change our implementation to it without too much trouble, if we're
// lucky...
char mValue = '\0';
public:
Utf8Unit() = default;
explicit constexpr Utf8Unit(char aUnit) : mValue(aUnit) {}
explicit constexpr Utf8Unit(unsigned char aUnit)
: mValue(static_cast<char>(aUnit)) {
// Per the above comment, the prior cast is integral conversion with
// implementation-defined semantics, and we regretfully but unavoidably
// assume the conversion does what we want it to.
}
constexpr bool operator==(const Utf8Unit& aOther) const {
return mValue == aOther.mValue;
}
constexpr bool operator!=(const Utf8Unit& aOther) const {
return !(*this == aOther);
}
/** Convert a UTF-8 code unit to a raw char. */
constexpr char toChar() const {
// Only a |char| is ever permitted to be written into this location, so this
// is both permissible and returns the desired value.
return mValue;
}
/** Convert a UTF-8 code unit to a raw unsigned char. */
constexpr unsigned char toUnsignedChar() const {
// Per the above comment, this is well-defined integral conversion.
return static_cast<unsigned char>(mValue);
}
/** Convert a UTF-8 code unit to a uint8_t. */
constexpr uint8_t toUint8() const {
// Per the above comment, this is well-defined integral conversion.
return static_cast<uint8_t>(mValue);
}
// We currently don't expose |&mValue|. |UnicodeData| sort of does, but
// that's a somewhat separate concern, justified in different comments in
// that other code.
};
/**
* Reinterpret the address of a UTF-8 code unit as |const unsigned char*|.
*
* Assuming proper backing has been set up, the resulting |const unsigned char*|
* may validly be dereferenced.
*
* No access is provided to mutate this underlying memory as |unsigned char|.
* Presently memory inside |Utf8Unit| is *only* stored as |char|, and we are
* loath to offer a way to write non-|char| data until absolutely necessary.
*/
inline const unsigned char* Utf8AsUnsignedChars(const Utf8Unit* aUnits) {
static_assert(sizeof(Utf8Unit) == sizeof(unsigned char),
"sizes must match to permissibly reinterpret_cast<>");
static_assert(alignof(Utf8Unit) == alignof(unsigned char),
"alignment must match to permissibly reinterpret_cast<>");
// The static_asserts above only enable the reinterpret_cast<> to occur.
//
// Dereferencing the resulting pointer is a separate question. Any object's
// memory may be interpreted as |unsigned char| per C++11 [basic.lval]p10, but
// this doesn't guarantee what values will be observed. If |char| is
// implemented to act like |unsigned char|, we're good to go: memory for the
// |char| in |Utf8Unit| acts as we need. But if |char| is implemented to act
// like |signed char|, dereferencing produces the right value only if the
// |char| types all use two's-complement representation. Every modern
// compiler does this, and there's a C++ proposal to standardize it.
// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0907r0.html So
// *technically* this is implementation-defined -- but everyone does it and
// this behavior is being standardized.
return reinterpret_cast<const unsigned char*>(aUnits);
}
/** Returns true iff |aUnit| is an ASCII value. */
constexpr bool IsAscii(Utf8Unit aUnit) {
return IsAscii(aUnit.toUnsignedChar());
}
/**
* Return true if the given span of memory consists of a valid UTF-8
* string and false otherwise.
*
* The string *may* contain U+0000 NULL code points.
*/
inline bool IsUtf8(mozilla::Span<const char> aString) {
#if MOZ_HAS_JSRUST()
size_t length = aString.Length();
const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements());
// For short strings, the function call is a pessimization, and the SIMD
// code won't have a chance to kick in anyway.
if (length < 16) {
for (size_t i = 0; i < length; i++) {
if (ptr[i] >= 0x80U) {
ptr += i;
length -= i;
goto end;
}
}
return true;
}
end:
return length == encoding_utf8_valid_up_to(ptr, length);
#else
return detail::IsValidUtf8(aString.Elements(), aString.Length());
#endif
}
#if MOZ_HAS_JSRUST()
// See Latin1.h for conversions between Latin1 and UTF-8.
/**
* Returns the index of the start of the first malformed byte
* sequence or the length of the string if there are none.
*/
inline size_t Utf8ValidUpTo(mozilla::Span<const char> aString) {
return encoding_utf8_valid_up_to(
reinterpret_cast<const uint8_t*>(aString.Elements()), aString.Length());
}
/**
* Converts potentially-invalid UTF-16 to UTF-8 replacing lone surrogates
* with the REPLACEMENT CHARACTER.
*
* The length of aDest must be at least the length of aSource times three.
*
* Returns the number of code units written.
*/
inline size_t ConvertUtf16toUtf8(mozilla::Span<const char16_t> aSource,
mozilla::Span<char> aDest) {
return encoding_mem_convert_utf16_to_utf8(
aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
}
/**
* Converts potentially-invalid UTF-8 to UTF-16 replacing malformed byte
* sequences with the REPLACEMENT CHARACTER with potentially insufficient
* output space.
*
* Returns the number of code units read and the number of bytes written.
*
* If the output isn't large enough, not all input is consumed.
*
* The conversion is guaranteed to be complete if the length of aDest is
* at least the length of aSource times three.
*
* The output is always valid UTF-8 ending on scalar value boundary
* even in the case of partial conversion.
*
* The semantics of this function match the semantics of
* TextEncoder.encodeInto.
* https://encoding.spec.whatwg.org/#dom-textencoder-encodeinto
*/
inline mozilla::Tuple<size_t, size_t> ConvertUtf16toUtf8Partial(
mozilla::Span<const char16_t> aSource, mozilla::Span<char> aDest) {
size_t srcLen = aSource.Length();
size_t dstLen = aDest.Length();
encoding_mem_convert_utf16_to_utf8_partial(aSource.Elements(), &srcLen,
aDest.Elements(), &dstLen);
return mozilla::MakeTuple(srcLen, dstLen);
}
/**
* Converts potentially-invalid UTF-8 to UTF-16 replacing malformed byte
* sequences with the REPLACEMENT CHARACTER.
*
* Returns the number of code units written.
*
* The length of aDest must be at least one greater than the length of aSource
* even though the last slot isn't written to.
*
* If you know that the input is valid for sure, use
* UnsafeConvertValidUtf8toUtf16() instead.
*/
inline size_t ConvertUtf8toUtf16(mozilla::Span<const char> aSource,
mozilla::Span<char16_t> aDest) {
return encoding_mem_convert_utf8_to_utf16(
aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
}
/**
* Converts known-valid UTF-8 to UTF-16. If the input might be invalid,
* use ConvertUtf8toUtf16() or ConvertUtf8toUtf16WithoutReplacement() instead.
*
* Returns the number of code units written.
*
* The length of aDest must be at least the length of aSource.
*/
inline size_t UnsafeConvertValidUtf8toUtf16(mozilla::Span<const char> aSource,
mozilla::Span<char16_t> aDest) {
return encoding_mem_convert_utf8_to_utf16(
aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
}
/**
* Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
*
* Returns the number of code units written or `mozilla::Nothing` if the
* input was invalid.
*
* The length of the destination buffer must be at least the length of the
* source buffer.
*
* When the input was invalid, some output may have been written.
*
* If you know that the input is valid for sure, use
* UnsafeConvertValidUtf8toUtf16() instead.
*/
inline mozilla::Maybe<size_t> ConvertUtf8toUtf16WithoutReplacement(
mozilla::Span<const char> aSource, mozilla::Span<char16_t> aDest) {
size_t written = encoding_mem_convert_utf8_to_utf16_without_replacement(
aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
if (MOZ_UNLIKELY(written == std::numeric_limits<size_t>::max())) {
return mozilla::Nothing();
}
return mozilla::Some(written);
}
#endif // MOZ_HAS_JSRUST
/**
* Returns true iff |aUnit| is a UTF-8 trailing code unit matching the pattern
* 0b10xx'xxxx.
*/
inline bool IsTrailingUnit(Utf8Unit aUnit) {
return (aUnit.toUint8() & 0b1100'0000) == 0b1000'0000;
}
/**
* Given |aLeadUnit| that is a non-ASCII code unit, a pointer to an |Iter aIter|
* that (initially) itself points one unit past |aLeadUnit|, and
* |const EndIter& aEnd| that denotes the end of the UTF-8 data when compared
* against |*aIter| using |aEnd - *aIter|:
*
* If |aLeadUnit| and subsequent code units computed using |*aIter| (up to
* |aEnd|) encode a valid code point -- not exceeding Unicode's range, not a
* surrogate, in shortest form -- then return Some(that code point) and advance
* |*aIter| past those code units.
*
* Otherwise decrement |*aIter| (so that it points at |aLeadUnit|) and return
* Nothing().
*
* |Iter| and |EndIter| are generalized concepts most easily understood as if
* they were |const char*|, |const unsigned char*|, or |const Utf8Unit*|:
* iterators that when dereferenced can be used to construct a |Utf8Unit| and
* that can be compared and modified in certain limited ways. (Carefully note
* that this function mutates |*aIter|.) |Iter| and |EndIter| are template
* parameters to support more-complicated adaptor iterators.
*
* The template parameters after |Iter| allow users to implement custom handling
* for various forms of invalid UTF-8. A version of this function that defaults
* all such handling to no-ops is defined below this function. To learn how to
* define your own custom handling, consult the implementation of that function,
* which documents exactly how custom handler functors are invoked.
*
* This function is MOZ_ALWAYS_INLINE: if you don't need that, use the version
* of this function without the "Inline" suffix on the name.
*/
template <typename Iter, typename EndIter, class OnBadLeadUnit,
class OnNotEnoughUnits, class OnBadTrailingUnit, class OnBadCodePoint,
class OnNotShortestForm>
MOZ_ALWAYS_INLINE Maybe<char32_t> DecodeOneUtf8CodePointInline(
const Utf8Unit aLeadUnit, Iter* aIter, const EndIter& aEnd,
OnBadLeadUnit aOnBadLeadUnit, OnNotEnoughUnits aOnNotEnoughUnits,
OnBadTrailingUnit aOnBadTrailingUnit, OnBadCodePoint aOnBadCodePoint,
OnNotShortestForm aOnNotShortestForm) {
MOZ_ASSERT(Utf8Unit((*aIter)[-1]) == aLeadUnit);
char32_t n = aLeadUnit.toUint8();
MOZ_ASSERT(!IsAscii(n));
// |aLeadUnit| determines the number of trailing code units in the code point
// and the bits of |aLeadUnit| that contribute to the code point's value.
uint8_t remaining;
uint32_t min;
if ((n & 0b1110'0000) == 0b1100'0000) {
remaining = 1;
min = 0x80;
n &= 0b0001'1111;
} else if ((n & 0b1111'0000) == 0b1110'0000) {
remaining = 2;
min = 0x800;
n &= 0b0000'1111;
} else if ((n & 0b1111'1000) == 0b1111'0000) {
remaining = 3;
min = 0x10000;
n &= 0b0000'0111;
} else {
*aIter -= 1;
aOnBadLeadUnit();
return Nothing();
}
// If the code point would require more code units than remain, the encoding
// is invalid.
auto actual = aEnd - *aIter;
if (MOZ_UNLIKELY(actual < remaining)) {
*aIter -= 1;
aOnNotEnoughUnits(AssertedCast<uint8_t>(actual + 1), remaining + 1);
return Nothing();
}
for (uint8_t i = 0; i < remaining; i++) {
const Utf8Unit unit(*(*aIter)++);
// Every non-leading code unit in properly encoded UTF-8 has its high
// bit set and the next-highest bit unset.
if (MOZ_UNLIKELY(!IsTrailingUnit(unit))) {
uint8_t unitsObserved = i + 1 + 1;
*aIter -= unitsObserved;
aOnBadTrailingUnit(unitsObserved);
return Nothing();
}
// The code point being encoded is the concatenation of all the
// unconstrained bits.
n = (n << 6) | (unit.toUint8() & 0b0011'1111);
}
// UTF-16 surrogates and values outside the Unicode range are invalid.
if (MOZ_UNLIKELY(n > 0x10FFFF || (0xD800 <= n && n <= 0xDFFF))) {
uint8_t unitsObserved = remaining + 1;
*aIter -= unitsObserved;
aOnBadCodePoint(n, unitsObserved);
return Nothing();
}
// Overlong code points are also invalid.
if (MOZ_UNLIKELY(n < min)) {
uint8_t unitsObserved = remaining + 1;
*aIter -= unitsObserved;
aOnNotShortestForm(n, unitsObserved);
return Nothing();
}
return Some(n);
}
/**
* Identical to the above function, but not forced to be instantiated inline --
* the compiler is permitted to common up separate invocations if it chooses.
*/
template <typename Iter, typename EndIter, class OnBadLeadUnit,
class OnNotEnoughUnits, class OnBadTrailingUnit, class OnBadCodePoint,
class OnNotShortestForm>
inline Maybe<char32_t> DecodeOneUtf8CodePoint(
const Utf8Unit aLeadUnit, Iter* aIter, const EndIter& aEnd,
OnBadLeadUnit aOnBadLeadUnit, OnNotEnoughUnits aOnNotEnoughUnits,
OnBadTrailingUnit aOnBadTrailingUnit, OnBadCodePoint aOnBadCodePoint,
OnNotShortestForm aOnNotShortestForm) {
return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd, aOnBadLeadUnit,
aOnNotEnoughUnits, aOnBadTrailingUnit,
aOnBadCodePoint, aOnNotShortestForm);
}
/**
* Like the always-inlined function above, but with no-op behavior from all
* trailing if-invalid notifier functors.
*
* This function is MOZ_ALWAYS_INLINE: if you don't need that, use the version
* of this function without the "Inline" suffix on the name.
*/
template <typename Iter, typename EndIter>
MOZ_ALWAYS_INLINE Maybe<char32_t> DecodeOneUtf8CodePointInline(
const Utf8Unit aLeadUnit, Iter* aIter, const EndIter& aEnd) {
// aOnBadLeadUnit is called when |aLeadUnit| itself is an invalid lead unit in
// a multi-unit code point. It is passed no arguments: the caller already has
// |aLeadUnit| on hand, so no need to provide it again.
auto onBadLeadUnit = []() {};
// aOnNotEnoughUnits is called when |aLeadUnit| properly indicates a code
// point length, but there aren't enough units from |*aIter| to |aEnd| to
// satisfy that length. It is passed the number of code units actually
// available (according to |aEnd - *aIter|) and the number of code units that
// |aLeadUnit| indicates are needed. Both numbers include the contribution
// of |aLeadUnit| itself: so |aUnitsAvailable <= 3|, |aUnitsNeeded <= 4|, and
// |aUnitsAvailable < aUnitsNeeded|. As above, it also is not passed the lead
// code unit.
auto onNotEnoughUnits = [](uint8_t aUnitsAvailable, uint8_t aUnitsNeeded) {};
// aOnBadTrailingUnit is called when one of the trailing code units implied by
// |aLeadUnit| doesn't match the 0b10xx'xxxx bit pattern that all UTF-8
// trailing code units must satisfy. It is passed the total count of units
// observed (including |aLeadUnit|). The bad trailing code unit will
// conceptually be at |(*aIter)[aUnitsObserved - 1]| if this functor is
// called, and so |aUnitsObserved <= 4|.
auto onBadTrailingUnit = [](uint8_t aUnitsObserved) {};
// aOnBadCodePoint is called when a structurally-correct code point encoding
// is found, but the *value* that is encoded is not a valid code point: either
// because it exceeded the U+10FFFF Unicode maximum code point, or because it
// was a UTF-16 surrogate. It is passed the non-code point value and the
// number of code units used to encode it.
auto onBadCodePoint = [](char32_t aBadCodePoint, uint8_t aUnitsObserved) {};
// aOnNotShortestForm is called when structurally-correct encoding is found,
// but the encoded value should have been encoded in fewer code units (e.g.
// mis-encoding U+0000 as 0b1100'0000 0b1000'0000 in two code units instead of
// as 0b0000'0000). It is passed the mis-encoded code point (which will be
// valid and not a surrogate) and the count of code units that mis-encoded it.
auto onNotShortestForm = [](char32_t aBadCodePoint, uint8_t aUnitsObserved) {
};
return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd, onBadLeadUnit,
onNotEnoughUnits, onBadTrailingUnit,
onBadCodePoint, onNotShortestForm);
}
/**
* Identical to the above function, but not forced to be instantiated inline --
* the compiler/linker are allowed to common up separate invocations.
*/
template <typename Iter, typename EndIter>
inline Maybe<char32_t> DecodeOneUtf8CodePoint(const Utf8Unit aLeadUnit,
Iter* aIter,
const EndIter& aEnd) {
return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd);
}
} // namespace mozilla
#endif /* mozilla_Utf8_h */
|