comm/mailnews/base/src/LineReader.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef LineReader_h__
#define LineReader_h__

#include <algorithm>
#include "mozilla/Span.h"
#include "mozilla/Vector.h"

/**
 * FirstLine() extracts the leading line from a span of bytes.
 * A line ends at the first LF ('\n') and the returned span includes that
 * terminator, so a CRLF ending comes back intact.
 * If the data contains no LF at all, an empty span is returned.
 */
inline mozilla::Span<const char> FirstLine(
    mozilla::Span<const char> const& data) {
  const auto dataEnd = data.cend();
  auto lineEnd = std::find(data.cbegin(), dataEnd, '\n');
  if (lineEnd != dataEnd) {
    // Step past the LF so that it is part of the returned line.
    ++lineEnd;
    return mozilla::Span<const char>(data.cbegin(), lineEnd);
  }
  // No terminator anywhere in the data, so there is no complete line.
  return data.First(0);
}

/**
 * LineReader breaks up continuous character streams into lines.
 * Data is fed in by calling Feed() as often as required, and a
 * callback function is invoked to handle each resulting line.
 *
 * The resulting lines include the end-of-line char(s), except for any
 * non-terminated final line.
 * LF ('\n') is used as the line terminator. CRLF-terminated lines will
 * be handled correctly - the resultant lines will include the line
 * terminators exactly as they appear in the input data.
 *
 * Goals for LineReader:
 * - Byte exact. The bytes fed in will appear _exactly_ in the callback fn.
 * - Callback can be inlined (due to templating).
 * - Avoid copying data if possible. The internal buffer is only used when
 *   lines are split across incoming chunks of data.
 * - Tries to avoid heap allocation. If the internal buffer is used, it'll
 *   only allocate memory for long lines (>80 chars).
 *
 * Example usage:
 *
 *    auto callback = [](mozilla::Span<const char> line) {
 *      printf("%s\n", nsCString(line).get());
 *      return true;
 *    };
 *
 *    LineReader c;
 *    c.Feed("Line 1\r\nLine 2\r\nLine 3", callback);
 *    // -> "Line 1\r\n"
 *    // -> "Line 2\r\n"
 *    c.Feed("\r\nLeftovers.", callback);
 *    // -> "Line 3\r\n"
 *    c.Flush(callback);
 *    // -> "Leftovers."
 *
 * See TestLineReader.cpp for more examples.
 */
class LineReader {
 public:
  /*
   * Feed() accepts the next chunk of data and splits it into lines. Call it
   * as many times as needed to deliver all your data, then call Flush() after
   * the final Feed() to pick up a last line which has no line ending.
   *
   * The callback is invoked once for each complete line extracted.
   * It is of the form:
   *   bool callback(mozilla::Span<const char> line);
   *
   * `line` includes its EOL character(s). It is only valid until the
   * callback returns, so a callback which wants to keep the data must copy
   * it.
   * Returning true from the callback continues processing.
   * Returning false stops processing straight away, even if more data is
   * available. Whatever is left of the chunk is neither processed nor
   * buffered.
   */
  template <typename LineFn>
  void Feed(mozilla::Span<const char> data, LineFn callback) {
    while (!data.IsEmpty()) {
      const auto chunkEnd = data.cend();
      auto lineEnd = std::find(data.cbegin(), chunkEnd, '\n');
      if (lineEnd == chunkEnd) {
        // What remains has no LF. Stash it and wait for more data.
        // TODO: limit maximum mBuf size, to stop maliciously-crafted input
        // OOMing us?
        if (!mBuf.append(data.data(), data.size())) {
          NS_ERROR("OOM!");
        }
        break;
      }

      // Split off the line (LF included) and advance past it.
      ++lineEnd;
      mozilla::Span<const char> line(data.cbegin(), lineEnd);
      data = mozilla::Span<const char>(lineEnd, chunkEnd);

      bool wantMore;
      if (!mBuf.empty()) {
        // An earlier chunk left a partial line behind. This piece finishes
        // it, so hand over the buffered copy and then reset the buffer.
        if (!mBuf.append(line.data(), line.size())) {
          NS_ERROR("OOM!");
        }
        wantMore = callback(mBuf);
        mBuf.clear();
      } else {
        // Nothing pending: pass the incoming bytes straight through, with
        // no copying.
        wantMore = callback(line);
      }

      if (!wantMore) {
        break;
      }
    }
  }

  /*
   * Flush() hands any leftover buffered data to the callback. Call it once
   * the last Feed() call has completed.
   * The line passed to the callback is a partial one, with no final LF.
   * If the input data ended with an LF there is nothing left over, and the
   * callback is not invoked. The callback's return value is ignored.
   */
  template <typename LineFn>
  void Flush(LineFn callback) {
    if (mBuf.empty()) {
      return;
    }
    callback(mBuf);
    mBuf.clear();
  }

 private:
  // Growable buffer which accumulates a line arriving in multiple parts.
  // Holds lines of up to 80 chars inline before it needs to reallocate.
  mozilla::Vector<char, 80> mBuf;
};

/**
 * SplitLines() walks the input data and invokes a callback for each complete
 * line it finds.
 *
 * The callback is of the form:
 *   bool callback(mozilla::Span<const char> line);
 * where line is a span covering the range of bytes in the input data which
 * makes up the line. No data is copied.
 *
 * Processing halts as soon as the callback returns false.
 *
 * Each line passed to the callback includes its end-of-line (EOL)
 * character(s).
 *
 * A line is terminated by '\n' (LF). CRLF-delimited data is therefore
 * handled correctly too, with the CR staying part of the line.
 *
 * This function is byte-exact: concatenating all the line spans, followed by
 * the unconsumed data returned at the end, reproduces exactly the byte
 * sequence of the original input data.
 *
 * @param data - The input bytes.
 * @param callback - The callback to invoke for each line.
 *
 * @returns the unconsumed data. Usually this is either empty or an
 *          incomplete line at the end (with no EOL). However, if the
 *          callback returned false, everything after the line it rejected is
 *          returned.
 */
template <typename LineFn>
mozilla::Span<const char> SplitLines(mozilla::Span<const char> data,
                                     LineFn callback) {
  while (!data.IsEmpty()) {
    const auto dataEnd = data.cend();
    auto lineEnd = std::find(data.cbegin(), dataEnd, '\n');
    if (lineEnd == dataEnd) {
      // No LF left, so what remains (if anything) is an incomplete line.
      return data;
    }

    // Split off the line (LF included) and advance past it.
    ++lineEnd;
    mozilla::Span<const char> line(data.cbegin(), lineEnd);
    data = mozilla::Span<const char>(lineEnd, dataEnd);

    if (callback(line) == false) {
      // The caller asked to stop. Return everything not yet handed over.
      return data;
    }
  }
  return data;
}

#endif