comm/mailnews/base/src/HeaderReader.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef HeaderReader_h__
#define HeaderReader_h__

#include <algorithm>
#include "LineReader.h"
#include "nsMsgUtils.h"
#include "nsString.h"
#include "mozilla/Span.h"

/**
 * HeaderReader extracts mail headers from a buffer of raw message data.
 * Data is supplied through Parse(), which calls a user-provided callback
 * once for every header it finds.
 *
 * Design goals:
 *
 * - Incremental: Parse() may be called repeatedly as the buffer grows.
 * - In-place: headers are reported as byte ranges within the caller's data.
 * - Safe on a partial header block (e.g. when sniffing the first N bytes of
 *   a message file): an incomplete header is never emitted by mistake.
 * - Exact byte offsets are tracked for values, so headers can be rewritten
 *   in place. This is needed to support X-Mozilla-Status et al.
 * - Copies are avoided where possible.
 * - The callback is a template parameter, so it can be inlined.
 * - The callback can halt processing (by returning false).
 * - Tolerant of real-world oddness in the input (for now, lines which don't
 *   make sense are simply skipped).
 *
 * Example usage:
 *    nsCString raw = "To: Alice\r\nFrom: Bob\r\n\r\n...Message body..."_ns;
 *    auto cb = [&](HeaderReader::Hdr const& hdr) {
 *      printf("-> '%s':'%s'\n", nsCString(hdr.Name(raw)).get(),
 *             hdr.Value(raw).get());
 *      return true;
 *    };
 *
 *    HeaderReader rdr;
 *    rdr.Parse(raw, cb);
 *    // -> 'To':'Alice'
 *    // -> 'From':'Bob'
 *
 * See TestHeaderReader.cpp for more examples.
 */
class HeaderReader {
 public:
  /**
   * Parse() walks the input buffer and invokes the callback for every
   * complete header it finds.
   *
   * It may be called any number of times; each call resumes where the
   * previous one stopped. The intended pattern is for the caller to
   * accumulate data in chunks and call Parse() after each one to pull out
   * headers as they arrive.
   * The data must be one contiguous allocation, but it does not have to live
   * at the same address on every call, so a growable buffer can safely be
   * used.
   *
   * The callback signature is:
   * bool hdrCallback(HeaderReader::Hdr const& hdr);
   *
   * Return true from the callback to keep parsing, or false to halt. This
   * permits, for example, an early-out when scanning for one specific header.
   *
   * Parse() stops when any of the following holds:
   * 1. The end of the header block is reached (the terminating blank line
   *    is consumed). From then on IsComplete() returns true.
   * 2. The callback returns false. Calling Parse() again safely resumes
   *    from where it left off.
   * 3. No further headers can be read. Some unconsumed data may be returned
   *    (e.g. a partial line). Parse() can safely be called again once more
   *    data is available, and resumes from the point previously reached.
   *
   * Calling Parse() on a truncated header block is safe: the callback is
   * only invoked for headers which are unambiguously complete.
   *
   * @param data - bytes containing the header block to parse.
   * @param hdrCallback - callback to invoke for each header found.
   *
   * @returns a span containing the unconsumed (leftover) data.
   */
  template <typename HeaderFn>
  mozilla::Span<const char> Parse(mozilla::Span<const char> data,
                                  HeaderFn hdrCallback);

  /**
   * IsComplete() returns true once the header block has been fully parsed,
   * i.e. the blank line separating the headers from the body has been
   * consumed. Further calls to Parse() will then consume no more data.
   */
  bool IsComplete() const { return mFinished; }

  /**
   * Hdr records where a single name/value pair sits within a header block.
   * The name begins at pos; the value begins at pos+rawValOffset.
   * All fields are offsets/lengths, so a Hdr is only meaningful together
   * with the data that was passed to Parse().
   */
  struct Hdr {
    uint32_t pos{0};           // Offset of the header's first byte.
    uint32_t len{0};           // Total header length, final EOL included.
    uint32_t nameLen{0};       // Length of the name part.
    uint32_t rawValOffset{0};  // Offset of the value, relative to pos.
    uint32_t rawValLen{0};     // Value length, final EOL excluded.

    // A default-constructed (zero-length) Hdr means "no header".
    bool IsEmpty() const { return len == 0; }

    /**
     * View the header name as a string.
     *
     * @param data - the data originally passed into Parse().
     * @returns a dependent string over the name bytes inside data (valid
     *          only for as long as data itself is valid).
     */
    nsDependentCSubstring Name(mozilla::Span<const char> data) const {
      const char* nameStart = data.Elements() + pos;
      return nsDependentCSubstring(nameStart, nameLen);
    }

    /**
     * View the raw, undecoded value as a string.
     *
     * @param data - the data originally passed into Parse().
     * @returns a dependent string over the value bytes inside data, with any
     *          interior EOLs left untouched (valid only for as long as data
     *          itself is valid).
     */
    nsDependentCSubstring RawValue(mozilla::Span<const char> data) const {
      const char* valueStart = data.Elements() + pos + rawValOffset;
      return nsDependentCSubstring(valueStart, rawValLen);
    }

    /**
     * Produce the 'cooked' value as a new string.
     * NOTE: this unfolds multi-line values by dropping the line breaks. No
     * attempt is made (yet) to deal with comments or quoted strings...
     *
     * @param data - the data originally passed into Parse().
     * @returns a newly-allocated string holding the value.
     */
    nsCString Value(mozilla::Span<const char> data) const {
      nsCString unfolded(RawValue(data));
      // Strip CRLF pairs first, then whatever bare LFs are left over.
      unfolded.ReplaceSubstring("\r\n"_ns, ""_ns);
      unfolded.ReplaceSubstring("\n"_ns, ""_ns);
      return unfolded;
    }

    /**
     * EOL() returns the end-of-line sequence which terminates the header,
     * as a view into data. It will be "\n" or "\r\n".
     * Calling this on an empty Hdr is unsupported.
     */
    nsDependentCSubstring EOL(mozilla::Span<const char> data) const {
      MOZ_ASSERT(len >= 2);  // Empty or malformed?

      // One past the last byte of the header.
      const uint32_t end = pos + len;
      int eolLen = 0;
      if (data[end - 1] == '\n') {
        eolLen = (data[end - 2] == '\r') ? 2 : 1;
      }
      return nsDependentCSubstring(data.Elements() + pos + len - eolLen,
                                   eolLen);
    }
  };

 private:
  // Offset within the data that Parse() has reached so far.
  uint32_t mPos{0};

  // The header currently being accumulated (empty if there is none).
  Hdr mHdr;

  // Number of EOL chars ending the most recent line of mHdr. They are counted
  // in mHdr.rawValLen while folded lines might still follow, and subtracted
  // once the header is known to be complete.
  int mEOLSize{0};

  // Set once the blank line ending the header block has been seen.
  bool mFinished{false};

  // Processes one complete line; returns false to stop line iteration.
  template <typename HeaderFn>
  bool HandleLine(mozilla::Span<const char> line, HeaderFn hdrCallback);
};

// Parse() implementation.
//
// Feeds each complete line of the not-yet-scanned data to HandleLine(), then
// decides what to do with a header that is still pending when the lines run
// out: emit it if the following byte proves it cannot be folded any further,
// otherwise rewind so it gets re-read on the next call.
//
// @param data - the whole header buffer (not just the newly-added part).
// @param hdrCallback - invoked for each complete header; returns false to
//                      halt parsing.
// @returns the span of data which has not been consumed.
template <typename HeaderFn>
mozilla::Span<const char> HeaderReader::Parse(mozilla::Span<const char> data,
                                              HeaderFn hdrCallback) {
  // The buffer may grow or move between calls, but must never shrink below
  // the point we have already scanned up to.
  MOZ_ASSERT(mPos <= data.Length());

  // If we're resuming, skip what we've already scanned.
  auto remaining = mozilla::Span<const char>(data.cbegin() + mPos, data.cend());
  if (mFinished) {
    return remaining;
  }

  // Remember whether the callback asked us to stop. HandleLine() only flushes
  // a header once it sees the line which starts the next one, so by the time
  // the callback says "halt" that next header is already pending in mHdr.
  // Without this flag the pending header could be emitted below, despite the
  // halt request.
  bool halted = false;
  auto relay = [&hdrCallback, &halted](Hdr const& hdr) -> bool {
    const bool keepGoing = hdrCallback(hdr);
    if (!keepGoing) {
      halted = true;
    }
    return keepGoing;
  };

  // Iterate over all the complete lines of our input.
  // NOTE(review): relies on SplitLines() (LineReader.h) stopping as soon as
  // the line handler returns false - confirm against its documentation.
  remaining =
      SplitLines(remaining, [this, &relay](mozilla::Span<const char> line) {
        return HandleLine(line, relay);
      });

  if (!mFinished && !mHdr.IsEmpty()) {
    // We didn't get to the end of the header block, but we may still be
    // able to finalise a previously-started header...
    const bool nextLineNotFolded = remaining.Length() > 0 &&
                                   remaining[0] != ' ' && remaining[0] != '\t';
    if (!halted && nextLineNotFolded) {
      // Next line isn't folded, so we know the header is complete.
      mHdr.rawValLen -= mEOLSize;
      hdrCallback(mHdr);
    } else {
      // Either we can't tell if the header is complete, or the callback
      // asked us to halt. Rewind so the header is re-read next time.
      mPos = mHdr.pos;
      remaining = mozilla::Span<const char>(data.cbegin() + mPos, data.cend());
    }
    mHdr = Hdr();
  }
  return remaining;
}

// Per-line worker for Parse(): invoked once for every complete
// (EOL-terminated) line of the header block.
//
// A line is one of:
// - blank: ends the header block (any pending header is emitted first);
// - folded (leading space/tab): extends the pending header;
// - anything else: flushes the pending header, then starts a new one if the
//   line contains a ':' (otherwise the line is skipped with a warning).
//
// mPos is advanced past the line in every case.
//
// @returns true if the caller should carry on with the next line, false if
//          it should stop (end of header block, or the callback said so).
template <typename HeaderFn>
bool HeaderReader::HandleLine(mozilla::Span<const char> line,
                              HeaderFn hdrCallback) {
  // Once the header block has ended, no more lines should be sent our way.
  MOZ_ASSERT(!mFinished);
  // Every line carries at least its terminator, so it can never be empty.
  MOZ_ASSERT(!line.IsEmpty());

  const auto lineStart = line.cbegin();
  const auto lineEnd = line.cend();

  // Step backwards over the terminator (LF or CRLF) to find where the text
  // of the line stops.
  auto textEnd = lineEnd;
  if (textEnd > lineStart && *(textEnd - 1) == '\n') {
    --textEnd;
    if (textEnd > lineStart && *(textEnd - 1) == '\r') {
      --textEnd;
    }
  }
  // An unterminated line should never have been passed in.
  MOZ_ASSERT(textEnd != lineEnd);

  // Emits the pending header (if there is one) and clears it. Yields the
  // callback's verdict, or true when there was nothing to emit.
  auto flushPending = [&]() -> bool {
    if (mHdr.IsEmpty()) {
      return true;
    }
    // The trailing EOL was provisionally counted as part of the value.
    mHdr.rawValLen -= mEOLSize;
    const bool wantMore = hdrCallback(mHdr);
    mHdr = Hdr();
    return wantMore;
  };

  // Nothing but a terminator? Then the header block is over.
  if (textEnd == lineStart) {
    flushPending();
    mFinished = true;
    mPos += line.Length();
    return false;  // All done.
  }

  // Leading whitespace marks a folded line, continuing the previous value.
  const char lead = line[0];
  if (lead == ' ' || lead == '\t') {
    if (mHdr.IsEmpty()) {
      // A continuation with no header to attach it to. There is nothing
      // sensible to do with it, so drop the line.
      NS_WARNING("Malformed header (bare continuation)");
    } else {
      // Extend the pending header to take in this line.
      mHdr.len += line.Length();
      mHdr.rawValLen += line.Length();
      mEOLSize = lineEnd - textEnd;
    }
    mPos += line.Length();
    return true;  // On to the next line.
  }

  // Anything else should be a "name: value" line opening a new header, which
  // also tells us that the previous header (if any) is complete.
  const bool wantMore = flushPending();

  const auto separator = std::find(lineStart, lineEnd, ':');
  if (separator == lineEnd) {
    // No colon, so this isn't a "name: value" line after all. Skip it.
    NS_WARNING("Malformed header (expected 'name: value')");
    mPos += line.Length();
    return wantMore;
  }

  // The value starts after the colon, less one optional space or tab.
  auto valueStart = separator + 1;
  if (*valueStart == ' ' || *valueStart == '\t') {
    ++valueStart;
  }

  // Begin the new header. It may yet grow, if folded lines follow.
  mHdr.pos = mPos;
  mHdr.len = line.Length();
  mHdr.nameLen = separator - lineStart;
  mHdr.rawValOffset = valueStart - lineStart;
  mHdr.rawValLen = lineEnd - valueStart;
  mEOLSize = lineEnd - textEnd;

  mPos += line.Length();
  return wantMore;
}

#endif