summaryrefslogtreecommitdiffstats
path: root/parser/htmlparser/nsExpatDriver.h
blob: 78f9caf343078dfa4b86428742b68b49d2118778 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef NS_EXPAT_DRIVER__
#define NS_EXPAT_DRIVER__

#include "expat_config.h"
#include "expat.h"
#include "nsCOMPtr.h"
#include "nsString.h"
#include "nsIDTD.h"
#include "nsIInputStream.h"
#include "nsIParser.h"
#include "nsCycleCollectionParticipant.h"

#include "rlbox_expat.h"
#include "nsRLBoxExpatDriver.h"
#include "mozilla/UniquePtr.h"

// Forward declarations for types that this header only names in
// declarations:
//  - nsIExpatSink: held via nsCOMPtr in nsExpatDriver::mSink.
//  - nsCatalogData: pointed to by nsExpatDriver::mCatalogData.
//  - RLBoxExpatSandboxData: returned by nsExpatDriver::SandboxData(); the
//    class itself is defined further down in this file.
//  - mozilla::Array: used for the nsExpatDriver::ExpatBaseURI alias.
class nsIExpatSink;
struct nsCatalogData;
class RLBoxExpatSandboxData;
namespace mozilla {
template <typename, size_t>
class Array;
}

/**
 * nsIDTD implementation that feeds char16_t buffers to an Expat XML parser
 * and exposes the Handle* entry points for the events Expat reports. The
 * Expat parser is referenced through RLBox types (see mSandboxPoolData and
 * mExpatParser).
 */
class nsExpatDriver : public nsIDTD {
  // Declared before the first access specifier, so the destructor is private
  // and cannot be invoked directly by code outside this class.
  virtual ~nsExpatDriver();

 public:
  NS_DECL_CYCLE_COLLECTING_ISUPPORTS_FINAL
  NS_DECL_NSIDTD
  NS_DECL_CYCLE_COLLECTION_CLASS(nsExpatDriver)

  nsExpatDriver();

  /**
   * Prepares the driver for a document.
   *
   * @param aURI  the URI of the document (presumably used as the initial base
   *              URI -- confirm against the implementation).
   * @param aSink the content sink that will receive the parse events.
   */
  nsresult Initialize(nsIURI* aURI, nsIContentSink* aSink);

  /**
   * Continues parsing with the data available in aScanner.
   *
   * @param aScanner      the scanner holding the data to parse.
   * @param aIsFinalChunk whether this is the final chunk of data.
   */
  nsresult ResumeParse(nsScanner& aScanner, bool aIsFinalChunk);

  // External entity reference handler. aBase is the base that Expat passes
  // along; see the base URI note above GetExpatBaseURI for how it is encoded.
  int HandleExternalEntityRef(const char16_t* aOpenEntityNames,
                              const char16_t* aBase, const char16_t* aSystemId,
                              const char16_t* aPublicId);

  // Element start/end handlers. Unlike the other handlers these are static
  // and receive the sandbox plus RLBox tainted_expat values directly. There
  // are two variants of each; presumably the *ForSystemPrincipal pair is the
  // one chosen when RLBoxExpatSandboxData::AttachDriver is called with
  // IsSystemPrincipal == true -- confirm against the implementation.
  static void HandleStartElement(rlbox_sandbox_expat& aSandbox,
                                 tainted_expat<void*> aUserData,
                                 tainted_expat<const char16_t*> aName,
                                 tainted_expat<const char16_t**> aAtts);
  static void HandleStartElementForSystemPrincipal(
      rlbox_sandbox_expat& aSandbox, tainted_expat<void*> aUserData,
      tainted_expat<const char16_t*> aName,
      tainted_expat<const char16_t**> aAtts);
  static void HandleEndElement(rlbox_sandbox_expat& aSandbox,
                               tainted_expat<void*> aUserData,
                               tainted_expat<const char16_t*> aName);
  static void HandleEndElementForSystemPrincipal(
      rlbox_sandbox_expat& aSandbox, tainted_expat<void*> aUserData,
      tainted_expat<const char16_t*> aName);

  // Handlers for the remaining Expat events. Lengths are expressed in
  // char16_t units, matching the character type of the buffers.
  nsresult HandleCharacterData(const char16_t* aCData, const uint32_t aLength);
  nsresult HandleComment(const char16_t* aName);
  nsresult HandleProcessingInstruction(const char16_t* aTarget,
                                       const char16_t* aData);
  nsresult HandleXMLDeclaration(const char16_t* aVersion,
                                const char16_t* aEncoding, int32_t aStandalone);
  nsresult HandleDefault(const char16_t* aData, const uint32_t aLength);
  nsresult HandleStartCdataSection();
  nsresult HandleEndCdataSection();
  nsresult HandleStartDoctypeDecl(const char16_t* aDoctypeName,
                                  const char16_t* aSysid,
                                  const char16_t* aPubid,
                                  bool aHasInternalSubset);
  nsresult HandleEndDoctypeDecl();

 private:
  /**
   * Load up an external stream to get external entity information.
   *
   * @param aFPIStr  the public identifier of the external DTD.
   * @param aURLStr  the system identifier (URL) of the external DTD.
   * @param aBaseURI the base URI to resolve aURLStr against.
   * @param aStream  [out] the opened input stream.
   * @param aAbsURI  [out] the resolved URI.
   */
  nsresult OpenInputStreamFromExternalDTD(const char16_t* aFPIStr,
                                          const char16_t* aURLStr,
                                          nsIURI* aBaseURI,
                                          nsIInputStream** aStream,
                                          nsIURI** aAbsURI);

  // Tells ParseChunk where the chunk being passed sits in the overall input.
  enum class ChunkOrBufferIsFinal {
    // Not the last chunk of the current run of ParseChunk calls.
    None,
    // The last chunk of the current run, but more buffers may follow.
    FinalChunk,
    // The last chunk of the last buffer; ParseChunk won't be called again for
    // the document being parsed.
    FinalChunkAndBuffer,
  };

  /**
   * Pass a buffer to Expat. If Expat is blocked aBuffer should be null and
   * aLength should be 0. The result of the call will be stored in
   * mInternalState. Expat will parse as much of the buffer as it can and store
   * the rest in its internal buffer.
   *
   * @param aBuffer the buffer to pass to Expat. May be null.
   * @param aLength the length of the buffer to pass to Expat (in number of
   *                char16_t's). Must be 0 if aBuffer is null and > 0 if
   *                aBuffer is not null.
   * @param aIsFinal whether this is the last chunk in a row passed to
   *                 ParseChunk, and if so whether it's the last chunk and
   *                 buffer passed to ParseChunk (meaning there will be no more
   *                 calls to ParseChunk for the document being parsed).
   * @param aConsumed [out] the number of char16_t's that Expat consumed. This
   *                        doesn't include the char16_t's that Expat stored in
   *                        its buffer but didn't parse yet.
   * @param aLastLineLength [out] the length of the last line that Expat has
   *                              consumed. This will only be computed if
   *                              aIsFinal is not None or mInternalState is set
   *                              to a failure.
   */
  void ParseChunk(const char16_t* aBuffer, uint32_t aLength,
                  ChunkOrBufferIsFinal aIsFinal, uint32_t* aConsumed,
                  XML_Size* aLastLineLength);
  /**
   * Wrapper for ParseChunk. If the buffer is too large to be copied into the
   * sandbox all at once, splits it into chunks and invokes ParseChunk in a
   * loop.
   *
   * @param aBuffer the buffer to pass to Expat. May be null.
   * @param aLength the length of the buffer to pass to Expat (in number of
   *                char16_t's). Must be 0 if aBuffer is null and > 0 if
   *                aBuffer is not null.
   * @param aIsFinal whether there will definitely not be any more new buffers
   *                 passed in after this one.
   * @param aPassedToExpat [out] presumably the number of char16_t's from
   *                             aBuffer that were handed to Expat -- confirm
   *                             against the implementation.
   * @param aConsumed [out] the number of char16_t's that Expat consumed. This
   *                        doesn't include the char16_t's that Expat stored in
   *                        its buffer but didn't parse yet.
   * @param aLastLineLength [out] the length of the last line that Expat has
   *                              consumed.
   */
  void ChunkAndParseBuffer(const char16_t* aBuffer, uint32_t aLength,
                           bool aIsFinal, uint32_t* aPassedToExpat,
                           uint32_t* aConsumed, XML_Size* aLastLineLength);

  nsresult HandleError();

  void MaybeStopParser(nsresult aState);

  // Returns true when mInternalState is one of the two "parser paused" codes
  // (blocked or interrupted). Only reads state, hence const.
  bool BlockedOrInterrupted() const {
    return mInternalState == NS_ERROR_HTMLPARSER_BLOCK ||
           mInternalState == NS_ERROR_HTMLPARSER_INTERRUPTED;
  }

  // Expat allows us to set the base URI for entities. It doesn't use the base
  // URI itself, but just passes it along to all the entity handlers (just the
  // external entity reference handler for us). It does expect the base URI as a
  // null-terminated string, with the same character type as the parsed buffers
  // (char16_t in our case). Because nsIURI stores a UTF-8 string we have to do
  // a conversion to UTF-16 for Expat. We also RLBox the Expat parser, so we
  // also do 2 copies (into RLBox sandbox, and Expat does a copy into its pool).
  // Most of the time this base URI is unused (the external entity handler is
  // rarely called), but when it is we also convert it back to a nsIURI, so we
  // convert the string back to UTF-8.
  //
  // We'd rather not do any of these conversions and copies, so we use a (hacky)
  // workaround. We store all base URIs in an array of nsIURIs. Instead of
  // passing the real URI to Expat as a string, we pass it a null-terminated
  // 2-character buffer. The first character of that buffer stores the index of
  // the corresponding nsIURI in the array (incremented by 1 because 0 is used
  // to terminate a string). The entity handler can then use the index from the
  // base URI that Expat passes it to look up the right nsIURI from the array.
  //
  // GetExpatBaseURI pushes the nsIURI to the array, and creates the
  // two-character buffer for it.
  //
  // GetBaseURI looks up the right nsIURI in the array, based on the index from
  // the two-character buffer.
  using ExpatBaseURI = mozilla::Array<XML_Char, 2>;
  ExpatBaseURI GetExpatBaseURI(nsIURI* aURI);
  nsIURI* GetBaseURI(const XML_Char* aBase) const;

  // Accessors for the sandbox data and the sandbox itself.
  RLBoxExpatSandboxData* SandboxData() const;
  rlbox_sandbox_expat* Sandbox() const;

  // Destroy expat parser and return sandbox to pool
  void Destroy();

  // The sandbox pool entry owned by this driver, and the (tainted) handle to
  // the Expat parser.
  mozilla::UniquePtr<mozilla::RLBoxSandboxPoolData> mSandboxPoolData;
  tainted_expat<XML_Parser> mExpatParser;

  nsString mLastLine;
  nsString mCDataText;
  // Various parts of a doctype
  nsString mDoctypeName;
  nsString mSystemID;
  nsString mPublicID;
  nsString mInternalSubset;
  bool mInCData;
  bool mInInternalSubset;
  bool mInExternalDTD;
  bool mMadeFinalCallToExpat;

  // Used to track if we're in the parser.
  bool mInParser;

  // Result of the most recent parse step; ParseChunk stores its outcome here
  // and BlockedOrInterrupted() inspects it.
  nsresult mInternalState;

  // The length of the data in Expat's buffer (in number of char16_t's).
  uint32_t mExpatBuffered;

  // NOTE(review): presumably the current element nesting depth -- confirm
  // against the implementation.
  uint16_t mTagDepth;

  // These sinks all refer the same conceptual object. mOriginalSink is
  // identical with the nsIContentSink* passed to WillBuildModel, and exists
  // only to avoid QI-ing back to nsIContentSink*.
  nsCOMPtr<nsIContentSink> mOriginalSink;
  nsCOMPtr<nsIExpatSink> mSink;

  const nsCatalogData* mCatalogData;  // weak
  // Base URIs handed to Expat by index; see the note above GetExpatBaseURI.
  nsTArray<nsCOMPtr<nsIURI>> mURIs;

  // Used for error reporting.
  uint64_t mInnerWindowID;
};

/**
 * Data associated with one Expat sandbox: owns the rlbox_sandbox_expat
 * instance (mSandbox) and holds the sandbox callbacks registered for Expat's
 * handlers. RLBoxExpatSandboxPool and nsExpatDriver are friends and can
 * therefore reach the private members directly.
 */
class RLBoxExpatSandboxData : public mozilla::RLBoxSandboxDataBase {
  friend class RLBoxExpatSandboxPool;
  friend class nsExpatDriver;

 public:
  // Forwards aSize to the base class and records the construction with
  // MOZ_COUNT_CTOR.
  explicit RLBoxExpatSandboxData(uint64_t aSize)
      : mozilla::RLBoxSandboxDataBase(aSize) {
    MOZ_COUNT_CTOR(RLBoxExpatSandboxData);
  }
  // Defined out of line.
  ~RLBoxExpatSandboxData();
  // Returns a non-owning pointer to the sandbox held in mSandbox (null if
  // mSandbox is empty).
  rlbox_sandbox_expat* Sandbox() const { return mSandbox.get(); }
  // After getting a sandbox from the pool we need to register the
  // Handle{Start,End}Element callbacks and associate the driver with the
  // sandbox.
  void AttachDriver(bool IsSystemPrincipal, void* aDriver);
  // Counterpart of AttachDriver.
  void DetachDriver();

 private:
  // The sandbox itself, owned by this object.
  mozilla::UniquePtr<rlbox_sandbox_expat> mSandbox;
  // Common expat callbacks that persist across calls to {Attach,Detach}Driver,
  // and consequently across sandbox reuses.
  sandbox_callback_expat<XML_XmlDeclHandler> mHandleXMLDeclaration;
  sandbox_callback_expat<XML_CharacterDataHandler> mHandleCharacterData;
  sandbox_callback_expat<XML_ProcessingInstructionHandler>
      mHandleProcessingInstruction;
  sandbox_callback_expat<XML_DefaultHandler> mHandleDefault;
  sandbox_callback_expat<XML_ExternalEntityRefHandler> mHandleExternalEntityRef;
  sandbox_callback_expat<XML_CommentHandler> mHandleComment;
  sandbox_callback_expat<XML_StartCdataSectionHandler> mHandleStartCdataSection;
  sandbox_callback_expat<XML_EndCdataSectionHandler> mHandleEndCdataSection;
  sandbox_callback_expat<XML_StartDoctypeDeclHandler> mHandleStartDoctypeDecl;
  sandbox_callback_expat<XML_EndDoctypeDeclHandler> mHandleEndDoctypeDecl;
  // Expat callbacks specific to each driver, and thus (re)set across sandbox
  // reuses.
  sandbox_callback_expat<XML_StartElementHandler> mHandleStartElement;
  sandbox_callback_expat<XML_EndElementHandler> mHandleEndElement;
};

#endif