summaryrefslogtreecommitdiffstats
path: root/src/lib/util/csv_file.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/util/csv_file.cc')
-rw-r--r--src/lib/util/csv_file.cc557
1 files changed, 557 insertions, 0 deletions
diff --git a/src/lib/util/csv_file.cc b/src/lib/util/csv_file.cc
new file mode 100644
index 0000000..f402038
--- /dev/null
+++ b/src/lib/util/csv_file.cc
@@ -0,0 +1,557 @@
+// Copyright (C) 2014-2021 Internet Systems Consortium, Inc. ("ISC")
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <config.h>
+#include <util/csv_file.h>
+
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <iomanip>
+
+namespace isc {
+namespace util {
+
+CSVRow::CSVRow(const size_t cols, const char separator)
+ : separator_(1, separator), values_(cols) {
+}
+
+CSVRow::CSVRow(const std::string& text, const char separator)
+ : separator_(1, separator) {
+ // Parsing is exception safe, so this will not throw.
+ parse(text);
+}
+
+void
+CSVRow::parse(const std::string& line) {
+ size_t sep_pos = 0;
+ size_t prev_pos = 0;
+ size_t len = 0;
+
+ // In case someone is reusing the row.
+ values_.clear();
+
+ // Iterate over line, splitting on separators.
+ while (prev_pos < line.size()) {
+ // Find the next separator.
+ sep_pos = line.find_first_of(separator_, prev_pos);
+ if (sep_pos == std::string::npos) {
+ break;
+ }
+
+ // Extract the value for the previous column.
+ len = sep_pos - prev_pos;
+ values_.push_back(line.substr(prev_pos, len));
+
+ // Move past the separator.
+ prev_pos = sep_pos + 1;
+ };
+
+ // Extract the last column.
+ len = line.size() - prev_pos;
+ values_.push_back(line.substr(prev_pos, len));
+}
+
+std::string
+CSVRow::readAt(const size_t at) const {
+ checkIndex(at);
+ return (values_[at]);
+}
+
+std::string
+CSVRow::readAtEscaped(const size_t at) const {
+ return (unescapeCharacters(readAt(at)));
+}
+
+std::string
+CSVRow::render() const {
+ std::ostringstream s;
+ for (size_t i = 0; i < values_.size(); ++i) {
+ // Do not put separator before the first value.
+ if (i > 0) {
+ s << separator_;
+ }
+ s << values_[i];
+ }
+ return (s.str());
+}
+
+void
+CSVRow::writeAt(const size_t at, const char* value) {
+ checkIndex(at);
+ values_[at] = value;
+}
+
+void
+CSVRow::writeAtEscaped(const size_t at, const std::string& value) {
+ writeAt(at, escapeCharacters(value, separator_));
+}
+
+void
+CSVRow::trim(const size_t count) {
+ checkIndex(count);
+ values_.resize(values_.size() - count);
+}
+
+std::ostream& operator<<(std::ostream& os, const CSVRow& row) {
+ os << row.render();
+ return (os);
+}
+
+void
+CSVRow::checkIndex(const size_t at) const {
+ if (at >= values_.size()) {
+ isc_throw(CSVFileError, "value index '" << at << "' of the CSV row"
+ " is out of bounds; maximal index is '"
+ << (values_.size() - 1) << "'");
+ }
+}
+
+CSVFile::CSVFile(const std::string& filename)
+ : filename_(filename), fs_(), cols_(0), read_msg_() {
+}
+
+CSVFile::~CSVFile() {
+ close();
+}
+
+void
+CSVFile::close() {
+ // It is allowed to close multiple times. If file has been already closed,
+ // this is no-op.
+ if (fs_) {
+ fs_->close();
+ fs_.reset();
+ }
+}
+
+bool
+CSVFile::exists() const {
+ std::ifstream fs(filename_.c_str());
+ const bool file_exists = fs.good();
+ fs.close();
+ return (file_exists);
+}
+
+void
+CSVFile::flush() const {
+ checkStreamStatusAndReset("flush");
+ fs_->flush();
+}
+
+void
+CSVFile::addColumn(const std::string& col_name) {
+ // It is not allowed to add a new column when file is open.
+ if (fs_) {
+ isc_throw(CSVFileError, "attempt to add a column '" << col_name
+ << "' while the file '" << getFilename()
+ << "' is open");
+ }
+ addColumnInternal(col_name);
+}
+
+void
+CSVFile::addColumnInternal(const std::string& col_name) {
+ if (std::find(cols_.begin(), cols_.end(), col_name) != cols_.end()) {
+ isc_throw(CSVFileError, "attempt to add duplicate column '"
+ << col_name << "'");
+ }
+ cols_.push_back(col_name);
+}
+
+void
+CSVFile::append(const CSVRow& row) const {
+ checkStreamStatusAndReset("append");
+
+ if (row.getValuesCount() != getColumnCount()) {
+ isc_throw(CSVFileError, "number of values in the CSV row '"
+ << row.getValuesCount() << "' doesn't match the number of"
+ " columns in the CSV file '" << getColumnCount() << "'");
+ }
+
+ /// @todo Apparently, seekp and seekg are interchangeable. A call to seekp
+ /// results in moving the input pointer too. This is ok for now. It means
+ /// that when the append() is called, the read pointer is moved to the EOF.
+ /// For the current use cases we only read a file and then append a new
+ /// content. If we come up with the scenarios when read and write is
+ /// needed at the same time, we may revisit this: perhaps remember the
+ /// old pointer. Also, for safety, we call both functions so as we are
+ /// sure that both pointers are moved.
+ fs_->seekp(0, std::ios_base::end);
+ fs_->seekg(0, std::ios_base::end);
+ fs_->clear();
+
+ std::string text = row.render();
+ *fs_ << text << std::endl;
+ if (!fs_->good()) {
+ fs_->clear();
+ isc_throw(CSVFileError, "failed to write CSV row '"
+ << text << "' to the file '" << filename_ << "'");
+ }
+}
+
+void
+CSVFile::checkStreamStatusAndReset(const std::string& operation) const {
+ if (!fs_) {
+ isc_throw(CSVFileError, "NULL stream pointer when performing '"
+ << operation << "' on file '" << filename_ << "'");
+
+ } else if (!fs_->is_open()) {
+ fs_->clear();
+ isc_throw(CSVFileError, "closed stream when performing '"
+ << operation << "' on file '" << filename_ << "'");
+
+ } else {
+ fs_->clear();
+ }
+}
+
+std::streampos
+CSVFile::size() const {
+ std::ifstream fs(filename_.c_str());
+ bool ok = fs.good();
+ // If something goes wrong, including that the file doesn't exist,
+ // return 0.
+ if (!ok) {
+ fs.close();
+ return (0);
+ }
+ std::ifstream::pos_type pos;
+ try {
+ // Seek to the end of file and see where we are. This is a size of
+ // the file.
+ fs.seekg(0, std::ifstream::end);
+ pos = fs.tellg();
+ fs.close();
+ } catch (const std::exception&) {
+ return (0);
+ }
+ return (pos);
+}
+
+size_t
+CSVFile::getColumnIndex(const std::string& col_name) const {
+ for (size_t i = 0; i < cols_.size(); ++i) {
+ if (cols_[i] == col_name) {
+ return (i);
+ }
+ }
+ isc_throw(isc::OutOfRange, "column '" << col_name << "' doesn't exist");
+}
+
+std::string
+CSVFile::getColumnName(const size_t col_index) const {
+ if (col_index >= cols_.size()) {
+ isc_throw(isc::OutOfRange, "column index " << col_index << " in the "
+ " CSV file '" << filename_ << "' is out of range; the CSV"
+ " file has only " << cols_.size() << " columns ");
+ }
+ return (cols_[col_index]);
+}
+
+bool
+CSVFile::next(CSVRow& row, const bool skip_validation) {
+ // Set something as row validation error. Although, we haven't started
+ // actual row validation we should get rid of any previously recorded
+ // errors so as the caller doesn't interpret them as the current one.
+ setReadMsg("validation not started");
+
+ try {
+ // Check that stream is "ready" for any IO operations.
+ checkStreamStatusAndReset("get next row");
+
+ } catch (const isc::Exception& ex) {
+ setReadMsg(ex.what());
+ return (false);
+ }
+
+ // Get the next non-blank line from the file.
+ std::string line;
+ while (fs_->good() && line.empty()) {
+ std::getline(*fs_, line);
+ }
+
+ // If we didn't read anything...
+ if (line.empty()) {
+ // If we reached the end of file, return an empty row to signal EOF.
+ if (fs_->eof()) {
+ row = EMPTY_ROW();
+ return (true);
+
+ } else if (!fs_->good()) {
+ // If we hit an IO error, communicate it to the caller but do NOT close
+ // the stream. Caller may try again.
+ setReadMsg("error reading a row from CSV file '"
+ + std::string(filename_) + "'");
+ return (false);
+ }
+ }
+
+ // Parse the line.
+ row.parse(line);
+
+ // And check if it is correct.
+ return (skip_validation ? true : validate(row));
+}
+
+void
+CSVFile::open(const bool seek_to_end) {
+ // If file doesn't exist or is empty, we have to create our own file.
+ if (size() == static_cast<std::streampos>(0)) {
+ recreate();
+
+ } else {
+ // Try to open existing file, holding some data.
+ fs_.reset(new std::fstream(filename_.c_str()));
+
+ // Catch exceptions so as we can close the file if error occurs.
+ try {
+ // The file may fail to open. For example, because of insufficient
+ // permissions. Although the file is not open we should call close
+ // to reset our internal pointer.
+ if (!fs_->is_open()) {
+ isc_throw(CSVFileError, "unable to open '" << filename_ << "'");
+ }
+ // Make sure we are on the beginning of the file, so as we
+ // can parse the header.
+ fs_->seekg(0);
+ if (!fs_->good()) {
+ isc_throw(CSVFileError, "unable to set read pointer in the file '"
+ << filename_ << "'");
+ }
+
+ // Read the header.
+ CSVRow header;
+ if (!next(header, true)) {
+ isc_throw(CSVFileError, "failed to read and parse header of the"
+ " CSV file '" << filename_ << "': "
+ << getReadMsg());
+ }
+
+ // Check the header against the columns specified for the CSV file.
+ if (!validateHeader(header)) {
+ isc_throw(CSVFileError, "invalid header '" << header
+ << "' in CSV file '" << filename_ << "': "
+ << getReadMsg());
+ }
+
+ // Everything is good, so if we haven't added any columns yet,
+ // add them.
+ if (getColumnCount() == 0) {
+ for (size_t i = 0; i < header.getValuesCount(); ++i) {
+ addColumnInternal(header.readAt(i));
+ }
+ }
+
+ // If caller requested that the pointer is set at the end of file,
+ // move both read and write pointer.
+ if (seek_to_end) {
+ fs_->seekp(0, std::ios_base::end);
+ fs_->seekg(0, std::ios_base::end);
+ if (!fs_->good()) {
+ isc_throw(CSVFileError, "unable to move to the end of"
+ " CSV file '" << filename_ << "'");
+ }
+ fs_->clear();
+ }
+
+ } catch (const std::exception&) {
+ close();
+ throw;
+ }
+ }
+}
+
+void
+CSVFile::recreate() {
+ // There is no sense creating a file if we don't specify columns for it.
+ if (getColumnCount() == 0) {
+ close();
+ isc_throw(CSVFileError, "no columns defined for the newly"
+ " created CSV file '" << filename_ << "'");
+ }
+
+ // Close any dangling files.
+ close();
+ fs_.reset(new std::fstream(filename_.c_str(), std::fstream::out));
+ if (!fs_->is_open()) {
+ close();
+ isc_throw(CSVFileError, "unable to open '" << filename_ << "'");
+ }
+ // Opened successfully. Write a header to it.
+ try {
+ CSVRow header(getColumnCount());
+ for (size_t i = 0; i < getColumnCount(); ++i) {
+ header.writeAt(i, getColumnName(i));
+ }
+ *fs_ << header << std::endl;
+
+ } catch (const std::exception& ex) {
+ close();
+ isc_throw(CSVFileError, ex.what());
+ }
+
+}
+
+bool
+CSVFile::validate(const CSVRow& row) {
+ setReadMsg("success");
+ bool ok = (row.getValuesCount() == getColumnCount());
+ if (!ok) {
+ std::ostringstream s;
+ s << "the size of the row '" << row << "' doesn't match the number of"
+ " columns '" << getColumnCount() << "' of the CSV file '"
+ << filename_ << "'";
+ setReadMsg(s.str());
+ }
+ return (ok);
+}
+
+bool
+CSVFile::validateHeader(const CSVRow& header) {
+ if (getColumnCount() == 0) {
+ return (true);
+ }
+
+ if (getColumnCount() != header.getValuesCount()) {
+ return (false);
+ }
+
+ for (size_t i = 0; i < getColumnCount(); ++i) {
+ if (getColumnName(i) != header.readAt(i)) {
+ return (false);
+ }
+ }
+ return (true);
+}
+
+const std::string CSVRow::escape_tag("&#x");
+
+std::string
+CSVRow::escapeCharacters(const std::string& orig_str, const std::string& characters) {
+ size_t char_pos = 0;
+ size_t prev_pos = 0;
+
+ // We add the first character of the escape tag to the list of
+ // characters to escape. This ensures input which happens to
+ // be valid escape sequences will be escaped.
+ std::string escape_chars(characters + escape_tag[0]);
+
+ // Check for a first occurrence. If none, just return a
+ // copy of the original.
+ char_pos = orig_str.find_first_of(escape_chars, prev_pos);
+ if (char_pos == std::string::npos) {
+ return(orig_str);
+ }
+
+ std::stringstream ss;
+ while (char_pos < orig_str.size()) {
+ // Copy everything upto the character to escape.
+ ss << orig_str.substr(prev_pos, char_pos - prev_pos);
+
+ // Copy the escape tag followed by the hex digits of the character.
+ ss << escape_tag << std::hex << std::setw(2)
+ << static_cast<uint16_t>(orig_str[char_pos]);
+
+ ++char_pos;
+ prev_pos = char_pos;
+
+ // Find the next character to escape.
+ char_pos = orig_str.find_first_of(escape_chars, prev_pos);
+
+ // If no more, copy the remainder of the string.
+ if (char_pos == std::string::npos) {
+ ss << orig_str.substr(prev_pos, char_pos - prev_pos);
+ break;
+ }
+
+ };
+
+ // Return the escaped string.
+ return(ss.str());
+}
+
+std::string
+CSVRow::unescapeCharacters(const std::string& escaped_str) {
+ size_t esc_pos = 0;
+ size_t start_pos = 0;
+
+ // Look for the escape tag.
+ esc_pos = escaped_str.find(escape_tag, start_pos);
+ if (esc_pos == std::string::npos) {
+ // No escape tags at all, we're done.
+ return(escaped_str);
+ }
+
+ // We have at least one escape tag.
+ std::stringstream ss;
+ while (esc_pos < escaped_str.size()) {
+ // Save everything up to the tag.
+ ss << escaped_str.substr(start_pos, esc_pos - start_pos);
+
+ // Now we need to see if we have valid hex digits
+ // following the tag.
+ unsigned int escaped_char = 0;
+ bool converted = true;
+ size_t dig_pos = esc_pos + escape_tag.size();
+ if (dig_pos <= escaped_str.size() - 2) {
+ for (int i = 0; i < 2; ++i) {
+ uint8_t digit = escaped_str[dig_pos];
+
+ if (digit >= 'a' && digit <= 'f') {
+ digit = digit - 'a' + 10;
+ } else if (digit >= 'A' && digit <= 'F') {
+ digit = digit - 'A' + 10;
+ } else if (digit >= '0' && digit <= '9') {
+ digit -= '0';
+ } else {
+ converted = false;
+ break;
+ }
+
+ if (i == 0) {
+ escaped_char = digit << 4;
+ } else {
+ escaped_char |= digit;
+ }
+
+ ++dig_pos;
+ }
+ }
+
+ // If we converted an escaped character, add it.
+ if (converted) {
+ ss << static_cast<unsigned char>(escaped_char);
+ esc_pos = dig_pos;
+ } else {
+ // Apparently the escape_tag was not followed by two valid hex
+ // digits. We'll assume it just happens to be in the string, so
+ // we'll include it in the output.
+ ss << escape_tag;
+ esc_pos += escape_tag.size();
+ }
+
+ // Set the new start of search.
+ start_pos = esc_pos;
+
+ // Look for the next escape tag.
+ esc_pos = escaped_str.find(escape_tag, start_pos);
+
+ // If we're at the end we're done.
+ if (esc_pos == std::string::npos) {
+ // Make sure we grab the remnant.
+ ss << escaped_str.substr(start_pos, esc_pos - start_pos);
+ break;
+ }
+ };
+
+ return(ss.str());
+}
+
+
+} // end of isc::util namespace
+} // end of isc namespace