js/src/util/Text.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * vim: set ts=8 sts=2 et sw=2 tw=80:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef util_Text_h
#define util_Text_h

#include "mozilla/ArrayUtils.h"
#include "mozilla/Assertions.h"
#include "mozilla/Attributes.h"
#include "mozilla/Casting.h"
#include "mozilla/Latin1.h"
#include "mozilla/Likely.h"
#include "mozilla/TextUtils.h"
#include "mozilla/Utf8.h"

#include <algorithm>
#include <stddef.h>
#include <stdint.h>
#include <string>
#include <type_traits>
#include <utility>

#include "NamespaceImports.h"

#include "js/Utility.h"
#include "util/Unicode.h"

namespace js {
class FrontendContext;
}  // namespace js

class JSLinearString;

/**
 * Length, in code units, of the null-terminated sequence starting at |s|.
 * The terminator itself is not counted.
 */
template <typename CharT>
static constexpr MOZ_ALWAYS_INLINE size_t js_strlen(const CharT* s) {
  using Traits = std::char_traits<CharT>;
  const size_t length = Traits::length(s);
  return length;
}

// Declaration only; the definition is not in this header.
// NOTE(review): judging by its name and signature this presumably searches
// [s, limit) for the code unit |c| -- confirm the exact range semantics and
// the return value on no-match against the definition.
template <typename CharT>
extern const CharT* js_strchr_limit(const CharT* s, char16_t c,
                                    const CharT* limit);

/**
 * Bounded string length: the index of the first null code unit found within
 * the first |maxlen| units of |s|, or |maxlen| if none of them is null.
 * Never reads past s[maxlen - 1].
 */
template <typename CharT>
static MOZ_ALWAYS_INLINE size_t js_strnlen(const CharT* s, size_t maxlen) {
  size_t count = 0;
  while (count < maxlen && s[count] != '\0') {
    ++count;
  }
  return count;
}

namespace js {

class JS_PUBLIC_API GenericPrinter;

/**
 * Map a code unit holding an ASCII digit to its numeric value
 * ('0' -> 0, ..., '9' -> 9).
 *
 * No range check is performed: the unit is converted to its unsigned
 * counterpart, '0' is subtracted, and the result is narrowed to uint8_t.
 */
template <typename CharT>
constexpr uint8_t AsciiDigitToNumber(CharT c) {
  using Unsigned = std::make_unsigned_t<CharT>;
  const Unsigned unit = static_cast<Unsigned>(c);
  return static_cast<uint8_t>(unit - '0');
}

/**
 * True iff |c|, viewed as an unsigned code unit, lies in the inclusive range
 * [' ', '~'].
 */
template <typename CharT>
static constexpr bool IsAsciiPrintable(CharT c) {
  using Unsigned = std::make_unsigned_t<CharT>;
  const auto unit = static_cast<Unsigned>(c);
  // Converting to unsigned first keeps negative |char| values (bytes >= 0x80)
  // from comparing as "small".
  return !(unit < ' ' || unit > '~');
}

/**
 * Compare the first |len| code units of |s1| and |s2| for equality using
 * mozilla::ArrayEqual.
 */
template <typename Char1, typename Char2>
inline bool EqualChars(const Char1* s1, const Char2* s2, size_t len) {
  constexpr bool charVsLatin1 =
      std::is_same_v<Char1, char> && std::is_same_v<Char2, JS::Latin1Char>;
  constexpr bool latin1VsChar =
      std::is_same_v<Char1, JS::Latin1Char> && std::is_same_v<Char2, char>;

  // For the mixed |char| / |JS::Latin1Char| pairings, view the Latin-1 side
  // as |char| so both operands have the same type and compilers emit
  // std::memcmp for the comparison.
  if constexpr (charVsLatin1) {
    const char* rhs = reinterpret_cast<const char*>(s2);
    return mozilla::ArrayEqual(s1, rhs, len);
  } else if constexpr (latin1VsChar) {
    const char* lhs = reinterpret_cast<const char*>(s1);
    return mozilla::ArrayEqual(lhs, s2, len);
  } else {
    return mozilla::ArrayEqual(s1, s2, len);
  }
}

// Return less than, equal to, or greater than zero depending on whether
// s1 is less than, equal to, or greater than s2.
//
// The first differing code unit within the common prefix decides the result
// (its difference is returned). If one sequence is a prefix of the other, the
// result is the length difference, clamped to the int32_t range so that its
// sign is always correct.
template <typename Char1, typename Char2>
inline int32_t CompareChars(const Char1* s1, size_t len1, const Char2* s2,
                            size_t len2) {
  size_t n = std::min(len1, len2);
  for (size_t i = 0; i < n; i++) {
    if (int32_t cmp = s1[i] - s2[i]) {
      return cmp;
    }
  }

  // Equal over the common prefix: order by length. Subtracting as size_t and
  // truncating to int32_t would flip the sign (or produce zero for unequal
  // lengths) once the gap no longer fits in 31 bits, so compare first and
  // clamp. For every gap that does fit, the value is the plain difference.
  if (len1 == len2) {
    return 0;
  }
  if (len1 > len2) {
    size_t gap = len1 - len2;
    return gap > size_t(INT32_MAX) ? INT32_MAX : int32_t(gap);
  }
  size_t gap = len2 - len1;
  return gap > size_t(INT32_MAX) ? INT32_MIN : -int32_t(gap);
}

// Skip leading Unicode white space (as classified by unicode::IsSpace) in
// [s, end). Returns the first position that is either |end| or holds a
// non-space code unit.
template <typename CharT>
static inline const CharT* SkipSpace(const CharT* s, const CharT* end) {
  MOZ_ASSERT(s <= end);

  const CharT* cursor = s;
  for (; cursor < end; ++cursor) {
    if (!unicode::IsSpace(*cursor)) {
      break;
    }
  }

  return cursor;
}

/*
 * DuplicateStringToArena overloads that take a JSContext*. Declarations only;
 * the definitions are not in this header.
 *
 * NOTE(review): the context-free overloads declared further below are
 * documented as not reporting OOMs, which suggests these overloads report
 * allocation failure through |cx| -- confirm against the definitions.
 * NOTE(review): overloads without |n| presumably expect |s| to be
 * null-terminated -- confirm.
 */
extern UniqueChars DuplicateStringToArena(arena_id_t destArenaId, JSContext* cx,
                                          const char* s);

extern UniqueChars DuplicateStringToArena(arena_id_t destArenaId, JSContext* cx,
                                          const char* s, size_t n);

extern UniqueLatin1Chars DuplicateStringToArena(arena_id_t destArenaId,
                                                JSContext* cx,
                                                const Latin1Char* s, size_t n);

extern UniqueTwoByteChars DuplicateStringToArena(arena_id_t destArenaId,
                                                 JSContext* cx,
                                                 const char16_t* s);

extern UniqueTwoByteChars DuplicateStringToArena(arena_id_t destArenaId,
                                                 JSContext* cx,
                                                 const char16_t* s, size_t n);

/*
 * Context-free DuplicateStringToArena overloads.
 *
 * These variants do not report OOMs; you must arrange for OOMs to be reported
 * yourself.
 */
extern UniqueChars DuplicateStringToArena(arena_id_t destArenaId,
                                          const char* s);

extern UniqueChars DuplicateStringToArena(arena_id_t destArenaId, const char* s,
                                          size_t n);

extern UniqueLatin1Chars DuplicateStringToArena(arena_id_t destArenaId,
                                                const JS::Latin1Char* s,
                                                size_t n);

extern UniqueTwoByteChars DuplicateStringToArena(arena_id_t destArenaId,
                                                 const char16_t* s);

extern UniqueTwoByteChars DuplicateStringToArena(arena_id_t destArenaId,
                                                 const char16_t* s, size_t n);

/*
 * DuplicateString overloads that take a context (JSContext* or
 * FrontendContext*). Declarations only; the definitions are not in this
 * header.
 *
 * NOTE(review): the context-free overloads declared further below are
 * documented as not reporting OOMs, which suggests these overloads report
 * allocation failure through the supplied context -- confirm against the
 * definitions.
 * NOTE(review): overloads without |n| presumably expect |s| to be
 * null-terminated -- confirm.
 */
extern UniqueChars DuplicateString(JSContext* cx, const char* s);
extern UniqueChars DuplicateString(FrontendContext* fc, const char* s);

extern UniqueChars DuplicateString(JSContext* cx, const char* s, size_t n);

extern UniqueLatin1Chars DuplicateString(JSContext* cx, const JS::Latin1Char* s,
                                         size_t n);

extern UniqueTwoByteChars DuplicateString(JSContext* cx, const char16_t* s);
extern UniqueTwoByteChars DuplicateString(FrontendContext* fc,
                                          const char16_t* s);

extern UniqueTwoByteChars DuplicateString(JSContext* cx, const char16_t* s,
                                          size_t n);

/*
 * Context-free DuplicateString overloads.
 *
 * These variants do not report OOMs; you must arrange for OOMs to be reported
 * yourself.
 */
extern UniqueChars DuplicateString(const char* s);

extern UniqueChars DuplicateString(const char* s, size_t n);

extern UniqueLatin1Chars DuplicateString(const JS::Latin1Char* s, size_t n);

extern UniqueTwoByteChars DuplicateString(const char16_t* s);

extern UniqueTwoByteChars DuplicateString(const char16_t* s, size_t n);

/*
 * Inflate bytes in ASCII encoding to char16_t code units. Return null on error,
 * otherwise return the char16_t buffer that was malloc'ed. A null char is
 * appended.
 *
 * The result is a raw pointer, so releasing the buffer is up to the caller.
 * NOTE(review): |length| is presumably the number of bytes in |bytes|, not
 * counting any terminator -- confirm against the definition.
 */
extern char16_t* InflateString(JSContext* cx, const char* bytes, size_t length);

/**
 * For a valid UTF-8, Latin-1, or WTF-16 code unit sequence, expose its contents
 * as the sequence of WTF-16 |char16_t| code units that would identically
 * constitute it.
 *
 * This primary template covers Latin-1 and WTF-16 input, where every source
 * unit widens to exactly one |char16_t|. UTF-8 input is handled by the
 * |mozilla::Utf8Unit| specialization.
 *
 * The sequence does not own the underlying units; it only holds pointers into
 * the caller's buffer.
 */
template <typename CharT>
class InflatedChar16Sequence {
 private:
  // Next unit to hand out, and one-past-the-end of the input.
  const CharT* units_;
  const CharT* limit_;

  static_assert(std::is_same_v<CharT, char16_t> ||
                    std::is_same_v<CharT, JS::Latin1Char>,
                "InflatedChar16Sequence only supports UTF-8/Latin-1/WTF-16");

 public:
  // Initialize |limit_| from the constructor parameter rather than from the
  // |units_| member, so correctness does not hinge on member declaration
  // order.
  InflatedChar16Sequence(const CharT* units, size_t len)
      : units_(units), limit_(units + len) {}

  // True while at least one more code unit can be obtained from next().
  bool hasMore() const { return units_ < limit_; }

  // Return the next code unit, widened to |char16_t|, and advance.
  // Requires hasMore().
  char16_t next() {
    MOZ_ASSERT(hasMore());
    return static_cast<char16_t>(*units_++);
  }

  // Hash all remaining code units with mozilla::AddToHash. Iterates over a
  // copy, so this sequence's position is left untouched.
  HashNumber computeHash() const {
    auto copy = *this;
    HashNumber hash = 0;
    while (copy.hasMore()) {
      hash = mozilla::AddToHash(hash, copy.next());
    }
    return hash;
  }
};

/**
 * UTF-8 specialization: decodes a valid UTF-8 code unit sequence and yields
 * the |char16_t| code units encoding the same code points. A code point at or
 * above unicode::NonBMPMin yields two units -- a lead surrogate, then a trail
 * surrogate -- over two consecutive next() calls.
 */
template <>
class InflatedChar16Sequence<mozilla::Utf8Unit> {
 private:
  // Next unit to decode, and one-past-the-end of the input.
  const mozilla::Utf8Unit* units_;
  const mozilla::Utf8Unit* limit_;

  // Trail surrogate of the most recently decoded non-BMP code point, waiting
  // to be returned by the following next() call. Zero means nothing pending.
  char16_t pendingTrailingSurrogate_ = 0;

 public:
  InflatedChar16Sequence(const mozilla::Utf8Unit* units, size_t len)
      : units_(units), limit_(units + len) {}

  // True while next() can produce another unit: either a trail surrogate is
  // pending or undecoded input remains.
  bool hasMore() const { return pendingTrailingSurrogate_ || units_ < limit_; }

  // Return the next |char16_t| code unit. Requires hasMore().
  char16_t next() {
    MOZ_ASSERT(hasMore());

    // Second half of a surrogate pair left over from the previous call:
    // return it and clear the pending slot.
    if (MOZ_UNLIKELY(pendingTrailingSurrogate_)) {
      char16_t trail = 0;
      std::swap(pendingTrailingSurrogate_, trail);
      return trail;
    }

    // Fast path: an ASCII unit maps directly to one char16_t.
    mozilla::Utf8Unit unit = *units_++;
    if (mozilla::IsAscii(unit)) {
      return static_cast<char16_t>(unit.toUint8());
    }

    // Multi-unit sequence. |units_| is passed by address.
    // NOTE(review): presumably so the decoder can advance it past the
    // trailing units -- confirm against mozilla/Utf8.h.
    mozilla::Maybe<char32_t> cp =
        mozilla::DecodeOneUtf8CodePoint(unit, &units_, limit_);
    MOZ_ASSERT(cp.isSome(), "input code unit sequence required to be valid");

    char32_t v = cp.value();
    if (v < unicode::NonBMPMin) {
      return mozilla::AssertedCast<char16_t>(v);
    }

    // Non-BMP: return the lead surrogate now and stash the trail surrogate
    // for the next call.
    char16_t lead;
    unicode::UTF16Encode(v, &lead, &pendingTrailingSurrogate_);

    MOZ_ASSERT(unicode::IsLeadSurrogate(lead));

    MOZ_ASSERT(pendingTrailingSurrogate_ != 0,
               "pendingTrailingSurrogate_ must be nonzero to be detected and "
               "returned next go-around");
    MOZ_ASSERT(unicode::IsTrailSurrogate(pendingTrailingSurrogate_));

    return lead;
  }

  // Hash all remaining code units with mozilla::AddToHash. Iterates over a
  // copy, so this sequence's position is left untouched.
  HashNumber computeHash() const {
    auto copy = *this;
    HashNumber hash = 0;
    while (copy.hasMore()) {
      hash = mozilla::AddToHash(hash, copy.next());
    }
    return hash;
  }
};

/*
 * Widen |srclen| bytes from |src| into char16_t code units stored at |dst|,
 * using mozilla::ConvertLatin1toUtf16. 'dst' must have room for 'srclen'
 * char16_t code units. No null terminator is written.
 */
inline void CopyAndInflateChars(char16_t* dst, const char* src, size_t srclen) {
  auto source = mozilla::Span(src, srclen);
  auto destination = mozilla::Span(dst, srclen);
  mozilla::ConvertLatin1toUtf16(source, destination);
}

/*
 * Latin-1 counterpart of the |const char*| overload: views |src| as chars
 * (mozilla::AsChars) and widens |srclen| units into |dst|, which must have
 * room for 'srclen' char16_t code units.
 */
inline void CopyAndInflateChars(char16_t* dst, const JS::Latin1Char* src,
                                size_t srclen) {
  auto latin1 = mozilla::Span(src, srclen);
  auto destination = mozilla::Span(dst, srclen);
  mozilla::ConvertLatin1toUtf16(mozilla::AsChars(latin1), destination);
}

/*
 * Convert one UCS-4 char and write it into a UTF-8 buffer, which must be at
 * least 4 bytes long.  Return the number of UTF-8 bytes of data written.
 *
 * Declaration only; the definition is not in this header.
 * NOTE(review): whether a terminator is written, and how out-of-range
 * |ucs4Char| values are handled, is not visible here -- confirm against the
 * definition.
 */
extern uint32_t OneUcs4ToUtf8Char(uint8_t* utf8Buffer, char32_t ucs4Char);

/*
 * Shared implementation behind PutEscapedString and EscapedStringPrinter.
 * Declarations only; the definitions are not in this header.
 *
 * As used by the inline wrappers in this header: PutEscapedString passes a
 * buffer and a null |out|, while EscapedStringPrinter passes a null buffer of
 * size 0 together with a printer. A return value of size_t(-1) is treated as
 * failure by both.
 */
extern size_t PutEscapedStringImpl(char* buffer, size_t size,
                                   GenericPrinter* out, JSLinearString* str,
                                   uint32_t quote);

template <typename CharT>
extern size_t PutEscapedStringImpl(char* buffer, size_t bufferSize,
                                   GenericPrinter* out, const CharT* chars,
                                   size_t length, uint32_t quote);

/*
 * Escape |str| into |buffer| as it would appear in a JS string literal: any
 * non-printable or non-ASCII character is written as a \escape.
 *
 * Unless size is 0, the buffer is guaranteed to end with a NUL. The return
 * value is the length of the written output, NOT including the NUL, so a
 * result of size or more means the output was truncated. If buffer is null,
 * only the output length is computed and returned. If quote is not 0, it must
 * be a single or double quote character that will quote the output.
 */
inline size_t PutEscapedString(char* buffer, size_t size, JSLinearString* str,
                               uint32_t quote) {
  const size_t n =
      PutEscapedStringImpl(buffer, size, /* out = */ nullptr, str, quote);

  /* Without a printer (file) there is no way for the Impl to fail. */
  MOZ_ASSERT(n != size_t(-1));
  return n;
}

/*
 * Raw-character overload of PutEscapedString: escapes the |length| code units
 * at |chars| into |buffer| (capacity |bufferSize|) and returns the value
 * produced by PutEscapedStringImpl.
 */
template <typename CharT>
inline size_t PutEscapedString(char* buffer, size_t bufferSize,
                               const CharT* chars, size_t length,
                               uint32_t quote) {
  const size_t n = PutEscapedStringImpl(buffer, bufferSize, /* out = */ nullptr,
                                        chars, length, quote);

  /* Without a printer (file) there is no way for the Impl to fail. */
  MOZ_ASSERT(n != size_t(-1));
  return n;
}

/*
 * Send the escaped form of |str| to the printer |out| (no character buffer is
 * used). Returns false iff PutEscapedStringImpl signals failure by returning
 * size_t(-1).
 */
inline bool EscapedStringPrinter(GenericPrinter& out, JSLinearString* str,
                                 uint32_t quote) {
  const size_t result = PutEscapedStringImpl(nullptr, 0, &out, str, quote);
  const bool failed = (result == size_t(-1));
  return !failed;
}

// Declaration only; the definition is not in this header.
// NOTE(review): presumably URI-encodes the |length| bytes at |chars| into a
// new JSString and returns null on failure -- confirm the escaping rules and
// the error behaviour against the definition.
JSString* EncodeURI(JSContext* cx, const char* chars, size_t length);

// Return true if input string contains a given flag in a comma separated list.
// |str| is the comma-separated list and |flag| is the entry to look for.
// NOTE(review): handling of a null |str|, surrounding whitespace, and partial
// matches is not visible from this declaration -- confirm against the
// definition.
bool ContainsFlag(const char* str, const char* flag);

namespace unicode {

/**
 * Compute the number of code points in the valid UTF-8 range [begin, end).
 *
 * The input is required to be valid UTF-8; |end| is exclusive.
 */
extern size_t CountCodePoints(const mozilla::Utf8Unit* begin,
                              const mozilla::Utf8Unit* end);

/**
 * Count the number of code points in the 16-bit unit range [begin, end);
 * |end| is exclusive.
 *
 * Unlike the UTF-8 case above, consistent with legacy ECMAScript practice,
 * every sequence of 16-bit units is considered valid.  Lone surrogates are
 * treated as if they represented a code point of the same value.
 */
extern size_t CountCodePoints(const char16_t* begin, const char16_t* end);

}  // namespace unicode

}  // namespace js

#endif  // util_Text_h