summaryrefslogtreecommitdiffstats
path: root/src/line_buffer.hh
diff options
context:
space:
mode:
Diffstat (limited to 'src/line_buffer.hh')
-rw-r--r--src/line_buffer.hh370
1 files changed, 370 insertions, 0 deletions
diff --git a/src/line_buffer.hh b/src/line_buffer.hh
new file mode 100644
index 0000000..ecd9e6c
--- /dev/null
+++ b/src/line_buffer.hh
@@ -0,0 +1,370 @@
+/**
+ * Copyright (c) 2007-2012, Timothy Stack
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * * Neither the name of Timothy Stack nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @file line_buffer.hh
+ */
+
+#ifndef line_buffer_hh
+#define line_buffer_hh
+
+#include <array>
+#include <exception>
+#include <future>
+#include <vector>
+
+#include <errno.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <zlib.h>
+
+#include "base/auto_fd.hh"
+#include "base/auto_mem.hh"
+#include "base/file_range.hh"
+#include "base/lnav_log.hh"
+#include "base/result.h"
+#include "safe/safe.h"
+#include "shared_buffer.hh"
+
+struct line_info {
+ file_range li_file_range;
+ bool li_partial{false};
+ bool li_valid_utf{true};
+ bool li_has_ansi{false};
+};
+
+/**
+ * Buffer for reading whole lines out of file descriptors. The class presents
+ * a stateless interface, callers specify the offset where a line starts and
+ * the class takes care of caching the surrounding range and locating the
+ * delimiter.
+ *
+ * XXX A bit of a wheel reinvention, but I'm not sure how well the libraries
+ * handle non-blocking I/O...
+ */
+class line_buffer {
+public:
+ static const ssize_t DEFAULT_LINE_BUFFER_SIZE;
+ static const ssize_t MAX_LINE_BUFFER_SIZE;
+ class error : public std::exception {
+ public:
+ explicit error(int err) : e_err(err) {}
+
+ int e_err;
+ };
+
+ struct header_data {
+ timeval hd_mtime{};
+ auto_buffer hd_extra{auto_buffer::alloc(0)};
+ std::string hd_name;
+ std::string hd_comment;
+
+ bool empty() const
+ {
+ return this->hd_mtime.tv_sec == 0 && this->hd_extra.empty()
+ && this->hd_name.empty() && this->hd_comment.empty();
+ }
+ };
+
+#define GZ_WINSIZE 32768U /*> gzip's max supported dictionary is 15-bits */
+#define GZ_RAW_MODE (-15) /*> Raw inflate data mode */
+#define GZ_HEADER_MODE (15 + 32) /*> Automatic zstd or gzip decoding */
+#define GZ_BORROW_BITS_MASK 7 /*> Bits (0-7) consumed in previous block */
+#define GZ_END_OF_BLOCK_MASK 128 /*> Stopped because reached end-of-block */
+#define GZ_END_OF_FILE_MASK 64 /*> Stopped because reached end-of-file */
+
+ /**
+ * A memoized gzip file reader that can do random file access faster than
+ * gzseek/gzread alone.
+ */
+ class gz_indexed {
+ public:
+ gz_indexed();
+ gz_indexed(gz_indexed&& other) = default;
+ ~gz_indexed() { this->close(); }
+
+ inline operator bool() const { return this->gz_fd != -1; }
+
+ uLong get_source_offset() const
+ {
+ return !!*this ? this->strm.total_in + this->strm.avail_in : 0;
+ }
+
+ void close();
+ void init_stream();
+ void continue_stream();
+ void open(int fd, header_data& hd);
+ int stream_data(void* buf, size_t size);
+ void seek(off_t offset);
+
+ /**
+ * Decompress bytes from the gz file returning at most `size` bytes.
+ * offset is the byte-offset in the decompressed data stream.
+ */
+ int read(void* buf, size_t offset, size_t size);
+
+ struct indexDict {
+ off_t in = 0;
+ off_t out = 0;
+ unsigned char bits = 0;
+ unsigned char in_bits = 0;
+ Bytef index[GZ_WINSIZE];
+ indexDict(z_stream const& s, const file_size_t size);
+
+ int apply(z_streamp s);
+ };
+
+ private:
+ z_stream strm; /*< gzip streams structure */
+ std::vector<indexDict>
+ syncpoints; /*< indexed dictionaries as discovered */
+ auto_mem<Bytef> inbuf; /*< Compressed data buffer */
+ int gz_fd = -1; /*< The file to read data from. */
+ };
+
+ /** Construct an empty line_buffer. */
+ line_buffer();
+
+ line_buffer(line_buffer&& other) = delete;
+
+ virtual ~line_buffer();
+
+ /** @param fd The file descriptor that data should be pulled from. */
+ void set_fd(auto_fd& fd);
+
+ /** @return The file descriptor that data should be pulled from. */
+ int get_fd() const { return this->lb_fd; }
+
+ time_t get_file_time() const { return this->lb_file_time; }
+
+ /**
+ * @return The size of the file or the amount of data pulled from a pipe.
+ */
+ file_ssize_t get_file_size() const { return this->lb_file_size; }
+
+ bool is_pipe() const { return !this->lb_seekable; }
+
+ bool is_pipe_closed() const
+ {
+ return !this->lb_seekable && (this->lb_file_size != -1);
+ }
+
+ bool is_compressed() const { return this->lb_compressed; }
+
+ file_off_t get_read_offset(file_off_t off) const
+ {
+ if (this->is_compressed()) {
+ return this->lb_compressed_offset;
+ }
+ return off;
+ }
+
+ bool is_data_available(file_off_t off, file_off_t stat_size) const
+ {
+ if (this->is_compressed()) {
+ return (this->lb_file_size == -1 || off < this->lb_file_size);
+ }
+ return off < stat_size;
+ }
+
+ /**
+ * Attempt to load the next line into the buffer.
+ *
+ * @param prev_line The range of the previous line.
+ * @return If the read was successful, information about the line.
+ * Otherwise, an error message.
+ */
+ Result<line_info, std::string> load_next_line(file_range prev_line = {});
+
+ Result<shared_buffer_ref, std::string> read_range(file_range fr);
+
+ file_range get_available();
+
+ bool is_likely_to_flush(file_range prev_line);
+
+ void flush_at(file_off_t off)
+ {
+ if (this->in_range(off)) {
+ this->lb_buffer.resize(off - this->lb_file_offset);
+ } else {
+ this->lb_buffer.clear();
+ }
+ }
+
+ /** Release any resources held by this object. */
+ void reset()
+ {
+ this->lb_fd.reset();
+
+ this->lb_file_offset = 0;
+ this->lb_file_size = (ssize_t) -1;
+ this->lb_buffer.resize(0);
+ this->lb_last_line_offset = -1;
+ }
+
+ /** Check the invariants for this object. */
+ bool invariant() const
+ {
+ require(this->lb_buffer.size() <= this->lb_buffer.capacity());
+
+ return true;
+ }
+
+ void quiesce();
+
+ struct stats {
+ bool empty() const
+ {
+ return this->s_decompressions == 0 && this->s_preads == 0
+ && this->s_requested_preloads == 0
+ && this->s_used_preloads == 0;
+ }
+
+ uint32_t s_decompressions{0};
+ uint32_t s_preads{0};
+ uint32_t s_requested_preloads{0};
+ uint32_t s_used_preloads{0};
+ std::array<uint32_t, 10> s_hist{};
+ };
+
+ struct stats consume_stats() { return std::exchange(this->lb_stats, {}); }
+
+ size_t get_buffer_size() const { return this->lb_buffer.size(); }
+
+ const header_data& get_header_data() const { return this->lb_header; }
+
+ void enable_cache();
+
+ static void cleanup_cache();
+
+private:
+ /**
+ * @param off The file offset to check for in the buffer.
+ * @return True if the given offset is cached in the buffer.
+ */
+ bool in_range(file_off_t off) const
+ {
+ return this->lb_file_offset <= off
+ && off
+ < (this->lb_file_offset + (file_ssize_t) this->lb_buffer.size());
+ }
+
+ void resize_buffer(size_t new_max);
+
+ /**
+ * Ensure there is enough room in the buffer to cache a range of data from
+ * the file. First, this method will check to see if there is enough room
+ * from where 'start' begins in the buffer to the maximum buffer size. If
+ * this is not enough, the currently cached data at 'start' will be moved
+ * to the beginning of the buffer, overwriting any cached data earlier in
+ * the file. Finally, if this is still not enough, the buffer will be
+ * reallocated to make more room.
+ *
+ * @param start The file offset of the start of the line.
+ * @param max_length The amount of data to be cached in the buffer.
+ */
+ void ensure_available(file_off_t start, ssize_t max_length);
+
+ /**
+ * Fill the buffer with the given range of data from the file.
+ *
+ * @param start The file offset where data should start to be read from the
+ * file.
+ * @param max_length The maximum amount of data to read from the file.
+ * @return True if any data was read from the file.
+ */
+ bool fill_range(file_off_t start, ssize_t max_length);
+
+ /**
+ * After a successful fill, the cached data can be retrieved with this
+ * method.
+ *
+ * @param start The file offset to retrieve cached data for.
+ * @param avail_out On return, the amount of data currently cached at the
+ * given offset.
+ * @return A pointer to the start of the cached data in the internal
+ * buffer.
+ */
+ const char* get_range(file_off_t start, file_ssize_t& avail_out) const
+ {
+ size_t buffer_offset = start - this->lb_file_offset;
+
+ require(buffer_offset >= 0);
+ require(this->lb_buffer.size() >= buffer_offset);
+
+ const auto* retval = this->lb_buffer.at(buffer_offset);
+ avail_out = this->lb_buffer.size() - buffer_offset;
+
+ return retval;
+ }
+
+ bool load_next_buffer();
+
+ using safe_gz_indexed = safe::Safe<gz_indexed>;
+
+ shared_buffer lb_share_manager;
+
+ auto_fd lb_fd; /*< The file to read data from. */
+ safe_gz_indexed lb_gz_file; /*< File reader for gzipped files. */
+ bool lb_bz_file{false}; /*< Flag set for bzip2 compressed files. */
+
+ auto_buffer lb_buffer{auto_buffer::alloc(DEFAULT_LINE_BUFFER_SIZE)};
+ nonstd::optional<auto_buffer> lb_alt_buffer;
+ std::vector<uint32_t> lb_alt_line_starts;
+ std::vector<bool> lb_alt_line_is_utf;
+ std::vector<bool> lb_alt_line_has_ansi;
+ std::future<bool> lb_loader_future;
+ nonstd::optional<file_off_t> lb_loader_file_offset;
+
+ file_off_t lb_compressed_offset{
+ 0}; /*< The offset into the compressed file. */
+ file_ssize_t lb_file_size{
+ -1}; /*<
+ * The size of the file. When lb_fd refers to
+ * a pipe, this is set to the amount of data
+ * read from the pipe when EOF is reached.
+ */
+ file_off_t lb_file_offset{0}; /*<
+ * Data cached in the buffer comes from this
+ * offset in the file.
+ */
+ time_t lb_file_time{0};
+ bool lb_seekable{false}; /*< Flag set for seekable file descriptors. */
+ bool lb_compressed{false};
+ file_off_t lb_last_line_offset{-1}; /*< */
+
+ std::vector<uint32_t> lb_line_starts;
+ std::vector<bool> lb_line_is_utf;
+ std::vector<bool> lb_line_has_ansi;
+ stats lb_stats;
+
+ nonstd::optional<auto_fd> lb_cached_fd;
+
+ header_data lb_header;
+};
+
+#endif