summaryrefslogtreecommitdiffstats
path: root/js/public/SourceText.h
blob: 118c871095c156917b9f14a0b9823443923536cc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * SourceText encapsulates a count of char16_t (UTF-16) or Utf8Unit (UTF-8)
 * code units (note: code *units*, not bytes or code points) and those code
 * units ("source units").  (Latin-1 is not supported: all places where Latin-1
 * must be compiled first convert to a supported encoding.)
 *
 * A SourceText either observes without owning, or takes ownership of, source
 * units passed to |SourceText::init|.  Thus SourceText can be used to
 * efficiently avoid copying.
 *
 * Rules for use:
 *
 *  1) The passed-in source units must be allocated with js_malloc(),
 *     js_calloc(), or js_realloc() if |SourceText::init| is instructed to take
 *     ownership of the source units.
 *  2) If |SourceText::init| merely borrows the source units, the user must
 *     keep them alive until associated JS compilation is complete.
 *  3) Code that calls |SourceText::take{Chars,Units}()| must keep the source
 *     units alive until JS compilation completes.  Normally only the JS engine
 *     should call |SourceText::take{Chars,Units}()|.
 *  4) Use the appropriate SourceText parameterization depending on the source
 *     units encoding.
 *
 * Example use:
 *
 *    size_t length = 512;
 *    char16_t* chars = js_pod_malloc<char16_t>(length);
 *    if (!chars) {
 *        JS_ReportOutOfMemory(cx);
 *        return false;
 *    }
 *    JS::SourceText<char16_t> srcBuf;
 *    if (!srcBuf.init(cx, chars, length, JS::SourceOwnership::TakeOwnership)) {
 *        return false;
 *    }
 *    JS::Rooted<JSScript*> script(cx);
 *    if (!JS::Compile(cx, options, srcBuf, &script)) {
 *        return false;
 *    }
 */

#ifndef js_SourceText_h
#define js_SourceText_h

#include "mozilla/Assertions.h"  // MOZ_ASSERT
#include "mozilla/Attributes.h"  // MOZ_COLD, MOZ_IS_CLASS_INIT, MOZ_MUST_USE
#include "mozilla/Likely.h"      // MOZ_UNLIKELY
#include "mozilla/Utf8.h"        // mozilla::Utf8Unit

#include <stddef.h>     // size_t
#include <stdint.h>     // UINT32_MAX
#include <type_traits>  // std::conditional_t, std::is_same_v

#include "js/UniquePtr.h"  // js::UniquePtr
#include "js/Utility.h"    // JS::FreePolicy

namespace JS {

namespace detail {

/**
 * Out-of-line failure hook that |SourceText::init| calls, passing its |cx|,
 * when the supplied source length exceeds |UINT32_MAX|; |init| then returns
 * false.
 *
 * The definition is not in this header.  Presumably it reports a
 * "source too long" error on |cx| -- confirm against the definition.  It is
 * marked MOZ_COLD, and |init| only reaches it under MOZ_UNLIKELY.
 */
MOZ_COLD extern JS_PUBLIC_API void ReportSourceTooLong(JSContext* cx);

}  // namespace detail

/**
 * Tells |SourceText::init| how to treat the source units it is handed.
 */
enum class SourceOwnership {
  // The SourceText merely observes the units.  The caller must keep them alive
  // until the associated JS compilation is complete.
  Borrowed,

  // The SourceText takes ownership of the units and releases them with
  // |js_free| in its destructor, unless they are taken back out first via
  // |take{Chars,Units}()|.
  TakeOwnership,
};

/**
 * A pointer to source units plus their count, where each unit is either
 * |char16_t| (UTF-16) or |mozilla::Utf8Unit| (UTF-8).
 *
 * A SourceText either observes units owned elsewhere or owns them itself;
 * owned units are released with |js_free| when the SourceText is destroyed.
 * SourceText is move-constructible only: copy construction and copy
 * assignment are deleted.
 */
template <typename Unit>
class SourceText final {
 private:
  static_assert(std::is_same_v<Unit, mozilla::Utf8Unit> ||
                    std::is_same_v<Unit, char16_t>,
                "Unit must be either char16_t or Utf8Unit for "
                "SourceText<Unit>");

  /** |char16_t| or |Utf8Unit| source units of uncertain validity. */
  const Unit* units_ = nullptr;

  /** The length in code units of |units_|. */
  uint32_t length_ = 0;

  /**
   * Whether this owns |units_| or merely observes source units owned by some
   * other object.
   */
  bool ownsUnits_ = false;

 public:
  // A C++ character type that can represent the source units -- suitable for
  // passing to C++ string functions.
  using CharT =
      std::conditional_t<std::is_same_v<Unit, char16_t>, char16_t, char>;

 public:
  /**
   * Construct a SourceText.  It must be initialized using |init()| before it
   * can be used as compilation source text.
   */
  SourceText() = default;

  /**
   * Construct a SourceText from contents extracted from |other|.  This
   * SourceText will then act exactly as |other| would have acted, had it
   * not been passed to this function.  |other| will return to its default-
   * constructed state and must have |init()| called on it to use it.
   *
   * This only copies and resets a pointer, an integer and a flag, so it is
   * declared |noexcept|; that lets type traits and generic code relying on
   * nothrow moves treat SourceText as cheaply movable.
   */
  SourceText(SourceText&& other) noexcept
      : units_(other.units_),
        length_(other.length_),
        ownsUnits_(other.ownsUnits_) {
    // Strip |other| of ownership so that its destructor frees nothing.
    other.units_ = nullptr;
    other.length_ = 0;
    other.ownsUnits_ = false;
  }

  /** Free the source units with |js_free| if (and only if) this owns them. */
  ~SourceText() {
    if (ownsUnits_) {
      js_free(const_cast<Unit*>(units_));
    }
  }

  /**
   * Initialize this with source unit data: |char16_t| for UTF-16 source
   * units, or |Utf8Unit| for UTF-8 source units.
   *
   * If |ownership == TakeOwnership|, *this function* takes ownership of
   * |units|, *even if* this function fails, and you MUST NOT free |units|
   * yourself.  This single-owner-friendly approach reduces risk of leaks on
   * failure.
   *
   * |units| may be null if |unitsLength == 0|; if so, this will silently be
   * initialized using non-null, unowned units.
   *
   * Returns true on success.  Returns false, after calling
   * |detail::ReportSourceTooLong(cx)|, if |unitsLength > UINT32_MAX|.
   *
   * Note that this overwrites the current fields without releasing anything:
   * units still owned from an earlier |init()| are not freed here, so don't
   * call |init()| on a SourceText that currently owns source units.
   */
  MOZ_IS_CLASS_INIT MOZ_MUST_USE bool init(JSContext* cx, const Unit* units,
                                           size_t unitsLength,
                                           SourceOwnership ownership) {
    MOZ_ASSERT_IF(units == nullptr, unitsLength == 0);

    // Ideally we'd use |Unit| and not cast below, but the risk of a static
    // initializer is too great.
    static const CharT emptyString[] = {'\0'};

    // Initialize all fields *before* checking length.  This ensures that
    // if |ownership == SourceOwnership::TakeOwnership|, |units| will be
    // freed when |this|'s destructor is called.
    //
    // The cast below truncates when |unitsLength > UINT32_MAX|, but in that
    // case this function fails further down, so the stored length must not be
    // relied upon.
    if (units) {
      units_ = units;
      length_ = static_cast<uint32_t>(unitsLength);
      ownsUnits_ = ownership == SourceOwnership::TakeOwnership;
    } else {
      units_ = reinterpret_cast<const Unit*>(emptyString);
      length_ = 0;
      ownsUnits_ = false;
    }

    // IMPLEMENTATION DETAIL, DO NOT RELY ON: This limit is used so we can
    // store offsets in |JSScript|s as |uint32_t|.  It could be lifted
    // fairly easily if desired, as the compiler uses |size_t| internally.
    if (MOZ_UNLIKELY(unitsLength > UINT32_MAX)) {
      detail::ReportSourceTooLong(cx);
      return false;
    }

    return true;
  }

  /**
   * Exactly identical to the |init()| overload above that accepts
   * |const Unit*|, but instead takes character data: |const CharT*|.
   *
   * (We can't just write this to accept |const CharT*|, because then in the
   * UTF-16 case this overload and the one above would be identical.  So we
   * use SFINAE to expose the |CharT| overload only if it's different.)
   */
  template <typename Char,
            typename = std::enable_if_t<std::is_same_v<Char, CharT> &&
                                        !std::is_same_v<Char, Unit>>>
  MOZ_IS_CLASS_INIT MOZ_MUST_USE bool init(JSContext* cx, const Char* chars,
                                           size_t charsLength,
                                           SourceOwnership ownership) {
    return init(cx, reinterpret_cast<const Unit*>(chars), charsLength,
                ownership);
  }

  /**
   * Initialize this using source units transferred out of |data|.
   *
   * |data| is released unconditionally and its contents are passed on with
   * |SourceOwnership::TakeOwnership|, so this owns them whether or not
   * initialization succeeds.
   */
  MOZ_MUST_USE bool init(JSContext* cx,
                         js::UniquePtr<Unit[], JS::FreePolicy> data,
                         size_t dataLength) {
    return init(cx, data.release(), dataLength, SourceOwnership::TakeOwnership);
  }

  /**
   * Exactly identical to the |init()| overload above that accepts
   * |UniquePtr<Unit[], JS::FreePolicy>|, but instead takes character data:
   * |UniquePtr<CharT[], JS::FreePolicy>|.
   *
   * (We can't just duplicate the signature above with s/Unit/CharT/, because
   * then in the UTF-16 case this overload and the one above would be identical.
   * So we use SFINAE to expose the |CharT| overload only if it's different.)
   */
  template <typename Char,
            typename = std::enable_if_t<std::is_same_v<Char, CharT> &&
                                        !std::is_same_v<Char, Unit>>>
  MOZ_MUST_USE bool init(JSContext* cx,
                         js::UniquePtr<Char[], JS::FreePolicy> data,
                         size_t dataLength) {
    return init(cx, data.release(), dataLength, SourceOwnership::TakeOwnership);
  }

  /**
   * Access the encapsulated data using a code unit type.
   *
   * This function is useful for code that wants to interact with source text
   * as *code units*, not as string data.  This doesn't matter for UTF-16,
   * but it's a crucial distinction for UTF-8.  When UTF-8 source text is
   * encapsulated, |Unit| being |mozilla::Utf8Unit| unambiguously indicates
   * that the code units are UTF-8.  In contrast |const char*| returned by
   * |get()| below could hold UTF-8 (or its ASCII subset) or Latin-1 or (in
   * particularly cursed embeddings) EBCDIC or some other legacy character
   * set.  Prefer this function to |get()| wherever possible.
   */
  const Unit* units() const { return units_; }

  /**
   * Access the encapsulated data using a character type.
   *
   * This function is useful for interactions with character-centric actions
   * like interacting with UniqueChars/UniqueTwoByteChars or printing out
   * text in a debugger, that only work with |CharT|.  But as |CharT| loses
   * encoding specificity when UTF-8 source text is encapsulated, prefer
   * |units()| to this function.
   */
  const CharT* get() const { return reinterpret_cast<const CharT*>(units_); }

  /**
   * Returns true if this owns the source units and will free them on
   * destruction.  If true, it is legal to call |take{Chars,Units}()|.
   */
  bool ownsUnits() const { return ownsUnits_; }

  /**
   * Count of the underlying source units -- code units, not bytes or code
   * points -- in this.
   */
  uint32_t length() const { return length_; }

  /**
   * Retrieve and take ownership of the underlying source units.  The caller
   * is now responsible for calling js_free() on the returned value, *but
   * only after JS script compilation has completed*.
   *
   * After underlying source units have been taken, this will continue to
   * refer to the same data -- it just won't own the data.  get() and
   * length() will return the same values, but ownsUnits() will be false.
   * The taken source units must be kept alive until after JS script
   * compilation completes, as noted above, for this to be safe.
   *
   * The caller must check ownsUnits() before calling takeUnits().  Taking
   * and then free'ing an unowned buffer will have dire consequences.
   */
  Unit* takeUnits() {
    MOZ_ASSERT(ownsUnits_);
    ownsUnits_ = false;
    return const_cast<Unit*>(units_);
  }

  /**
   * Akin to |takeUnits()| in all respects, but returns characters rather
   * than units.
   */
  CharT* takeChars() { return reinterpret_cast<CharT*>(takeUnits()); }

 private:
  // Copying is forbidden: two copies would both believe they own |units_|.
  SourceText(const SourceText&) = delete;
  void operator=(const SourceText&) = delete;
};

}  // namespace JS

#endif /* js_SourceText_h */