diff options
Diffstat (limited to '')
-rw-r--r-- | src/lib/util/csv_file.cc | 557 |
1 files changed, 557 insertions, 0 deletions
diff --git a/src/lib/util/csv_file.cc b/src/lib/util/csv_file.cc new file mode 100644 index 0000000..f402038 --- /dev/null +++ b/src/lib/util/csv_file.cc @@ -0,0 +1,557 @@ +// Copyright (C) 2014-2021 Internet Systems Consortium, Inc. ("ISC") +// +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include <config.h> +#include <util/csv_file.h> + +#include <algorithm> +#include <iostream> +#include <fstream> +#include <sstream> +#include <iomanip> + +namespace isc { +namespace util { + +CSVRow::CSVRow(const size_t cols, const char separator) + : separator_(1, separator), values_(cols) { +} + +CSVRow::CSVRow(const std::string& text, const char separator) + : separator_(1, separator) { + // Parsing is exception safe, so this will not throw. + parse(text); +} + +void +CSVRow::parse(const std::string& line) { + size_t sep_pos = 0; + size_t prev_pos = 0; + size_t len = 0; + + // In case someone is reusing the row. + values_.clear(); + + // Iterate over line, splitting on separators. + while (prev_pos < line.size()) { + // Find the next separator. + sep_pos = line.find_first_of(separator_, prev_pos); + if (sep_pos == std::string::npos) { + break; + } + + // Extract the value for the previous column. + len = sep_pos - prev_pos; + values_.push_back(line.substr(prev_pos, len)); + + // Move past the separator. + prev_pos = sep_pos + 1; + }; + + // Extract the last column. + len = line.size() - prev_pos; + values_.push_back(line.substr(prev_pos, len)); +} + +std::string +CSVRow::readAt(const size_t at) const { + checkIndex(at); + return (values_[at]); +} + +std::string +CSVRow::readAtEscaped(const size_t at) const { + return (unescapeCharacters(readAt(at))); +} + +std::string +CSVRow::render() const { + std::ostringstream s; + for (size_t i = 0; i < values_.size(); ++i) { + // Do not put separator before the first value. + if (i > 0) { + s << separator_; + } + s << values_[i]; + } + return (s.str()); +} + +void +CSVRow::writeAt(const size_t at, const char* value) { + checkIndex(at); + values_[at] = value; +} + +void +CSVRow::writeAtEscaped(const size_t at, const std::string& value) { + writeAt(at, escapeCharacters(value, separator_)); +} + +void +CSVRow::trim(const size_t count) { + checkIndex(count); + values_.resize(values_.size() - count); +} + +std::ostream& operator<<(std::ostream& os, const CSVRow& row) { + os << row.render(); + return (os); +} + +void +CSVRow::checkIndex(const size_t at) const { + if (at >= values_.size()) { + isc_throw(CSVFileError, "value index '" << at << "' of the CSV row" + " is out of bounds; maximal index is '" + << (values_.size() - 1) << "'"); + } +} + +CSVFile::CSVFile(const std::string& filename) + : filename_(filename), fs_(), cols_(0), read_msg_() { +} + +CSVFile::~CSVFile() { + close(); +} + +void +CSVFile::close() { + // It is allowed to close multiple times. If file has been already closed, + // this is no-op. + if (fs_) { + fs_->close(); + fs_.reset(); + } +} + +bool +CSVFile::exists() const { + std::ifstream fs(filename_.c_str()); + const bool file_exists = fs.good(); + fs.close(); + return (file_exists); +} + +void +CSVFile::flush() const { + checkStreamStatusAndReset("flush"); + fs_->flush(); +} + +void +CSVFile::addColumn(const std::string& col_name) { + // It is not allowed to add a new column when file is open. + if (fs_) { + isc_throw(CSVFileError, "attempt to add a column '" << col_name + << "' while the file '" << getFilename() + << "' is open"); + } + addColumnInternal(col_name); +} + +void +CSVFile::addColumnInternal(const std::string& col_name) { + if (std::find(cols_.begin(), cols_.end(), col_name) != cols_.end()) { + isc_throw(CSVFileError, "attempt to add duplicate column '" + << col_name << "'"); + } + cols_.push_back(col_name); +} + +void +CSVFile::append(const CSVRow& row) const { + checkStreamStatusAndReset("append"); + + if (row.getValuesCount() != getColumnCount()) { + isc_throw(CSVFileError, "number of values in the CSV row '" + << row.getValuesCount() << "' doesn't match the number of" + " columns in the CSV file '" << getColumnCount() << "'"); + } + + /// @todo Apparently, seekp and seekg are interchangeable. A call to seekp + /// results in moving the input pointer too. This is ok for now. It means + /// that when the append() is called, the read pointer is moved to the EOF. + /// For the current use cases we only read a file and then append a new + /// content. If we come up with the scenarios when read and write is + /// needed at the same time, we may revisit this: perhaps remember the + /// old pointer. Also, for safety, we call both functions so as we are + /// sure that both pointers are moved. + fs_->seekp(0, std::ios_base::end); + fs_->seekg(0, std::ios_base::end); + fs_->clear(); + + std::string text = row.render(); + *fs_ << text << std::endl; + if (!fs_->good()) { + fs_->clear(); + isc_throw(CSVFileError, "failed to write CSV row '" + << text << "' to the file '" << filename_ << "'"); + } +} + +void +CSVFile::checkStreamStatusAndReset(const std::string& operation) const { + if (!fs_) { + isc_throw(CSVFileError, "NULL stream pointer when performing '" + << operation << "' on file '" << filename_ << "'"); + + } else if (!fs_->is_open()) { + fs_->clear(); + isc_throw(CSVFileError, "closed stream when performing '" + << operation << "' on file '" << filename_ << "'"); + + } else { + fs_->clear(); + } +} + +std::streampos +CSVFile::size() const { + std::ifstream fs(filename_.c_str()); + bool ok = fs.good(); + // If something goes wrong, including that the file doesn't exist, + // return 0. + if (!ok) { + fs.close(); + return (0); + } + std::ifstream::pos_type pos; + try { + // Seek to the end of file and see where we are. This is a size of + // the file. + fs.seekg(0, std::ifstream::end); + pos = fs.tellg(); + fs.close(); + } catch (const std::exception&) { + return (0); + } + return (pos); +} + +size_t +CSVFile::getColumnIndex(const std::string& col_name) const { + for (size_t i = 0; i < cols_.size(); ++i) { + if (cols_[i] == col_name) { + return (i); + } + } + isc_throw(isc::OutOfRange, "column '" << col_name << "' doesn't exist"); +} + +std::string +CSVFile::getColumnName(const size_t col_index) const { + if (col_index >= cols_.size()) { + isc_throw(isc::OutOfRange, "column index " << col_index << " in the " + " CSV file '" << filename_ << "' is out of range; the CSV" + " file has only " << cols_.size() << " columns "); + } + return (cols_[col_index]); +} + +bool +CSVFile::next(CSVRow& row, const bool skip_validation) { + // Set something as row validation error. Although, we haven't started + // actual row validation we should get rid of any previously recorded + // errors so as the caller doesn't interpret them as the current one. + setReadMsg("validation not started"); + + try { + // Check that stream is "ready" for any IO operations. + checkStreamStatusAndReset("get next row"); + + } catch (const isc::Exception& ex) { + setReadMsg(ex.what()); + return (false); + } + + // Get the next non-blank line from the file. + std::string line; + while (fs_->good() && line.empty()) { + std::getline(*fs_, line); + } + + // If we didn't read anything... + if (line.empty()) { + // If we reached the end of file, return an empty row to signal EOF. + if (fs_->eof()) { + row = EMPTY_ROW(); + return (true); + + } else if (!fs_->good()) { + // If we hit an IO error, communicate it to the caller but do NOT close + // the stream. Caller may try again. + setReadMsg("error reading a row from CSV file '" + + std::string(filename_) + "'"); + return (false); + } + } + + // Parse the line. + row.parse(line); + + // And check if it is correct. + return (skip_validation ? true : validate(row)); +} + +void +CSVFile::open(const bool seek_to_end) { + // If file doesn't exist or is empty, we have to create our own file. + if (size() == static_cast<std::streampos>(0)) { + recreate(); + + } else { + // Try to open existing file, holding some data. + fs_.reset(new std::fstream(filename_.c_str())); + + // Catch exceptions so as we can close the file if error occurs. + try { + // The file may fail to open. For example, because of insufficient + // permissions. Although the file is not open we should call close + // to reset our internal pointer. + if (!fs_->is_open()) { + isc_throw(CSVFileError, "unable to open '" << filename_ << "'"); + } + // Make sure we are on the beginning of the file, so as we + // can parse the header. + fs_->seekg(0); + if (!fs_->good()) { + isc_throw(CSVFileError, "unable to set read pointer in the file '" + << filename_ << "'"); + } + + // Read the header. + CSVRow header; + if (!next(header, true)) { + isc_throw(CSVFileError, "failed to read and parse header of the" + " CSV file '" << filename_ << "': " + << getReadMsg()); + } + + // Check the header against the columns specified for the CSV file. + if (!validateHeader(header)) { + isc_throw(CSVFileError, "invalid header '" << header + << "' in CSV file '" << filename_ << "': " + << getReadMsg()); + } + + // Everything is good, so if we haven't added any columns yet, + // add them. + if (getColumnCount() == 0) { + for (size_t i = 0; i < header.getValuesCount(); ++i) { + addColumnInternal(header.readAt(i)); + } + } + + // If caller requested that the pointer is set at the end of file, + // move both read and write pointer. + if (seek_to_end) { + fs_->seekp(0, std::ios_base::end); + fs_->seekg(0, std::ios_base::end); + if (!fs_->good()) { + isc_throw(CSVFileError, "unable to move to the end of" + " CSV file '" << filename_ << "'"); + } + fs_->clear(); + } + + } catch (const std::exception&) { + close(); + throw; + } + } +} + +void +CSVFile::recreate() { + // There is no sense creating a file if we don't specify columns for it. + if (getColumnCount() == 0) { + close(); + isc_throw(CSVFileError, "no columns defined for the newly" + " created CSV file '" << filename_ << "'"); + } + + // Close any dangling files. + close(); + fs_.reset(new std::fstream(filename_.c_str(), std::fstream::out)); + if (!fs_->is_open()) { + close(); + isc_throw(CSVFileError, "unable to open '" << filename_ << "'"); + } + // Opened successfully. Write a header to it. + try { + CSVRow header(getColumnCount()); + for (size_t i = 0; i < getColumnCount(); ++i) { + header.writeAt(i, getColumnName(i)); + } + *fs_ << header << std::endl; + + } catch (const std::exception& ex) { + close(); + isc_throw(CSVFileError, ex.what()); + } + +} + +bool +CSVFile::validate(const CSVRow& row) { + setReadMsg("success"); + bool ok = (row.getValuesCount() == getColumnCount()); + if (!ok) { + std::ostringstream s; + s << "the size of the row '" << row << "' doesn't match the number of" + " columns '" << getColumnCount() << "' of the CSV file '" + << filename_ << "'"; + setReadMsg(s.str()); + } + return (ok); +} + +bool +CSVFile::validateHeader(const CSVRow& header) { + if (getColumnCount() == 0) { + return (true); + } + + if (getColumnCount() != header.getValuesCount()) { + return (false); + } + + for (size_t i = 0; i < getColumnCount(); ++i) { + if (getColumnName(i) != header.readAt(i)) { + return (false); + } + } + return (true); +} + +const std::string CSVRow::escape_tag("&#x"); + +std::string +CSVRow::escapeCharacters(const std::string& orig_str, const std::string& characters) { + size_t char_pos = 0; + size_t prev_pos = 0; + + // We add the first character of the escape tag to the list of + // characters to escape. This ensures input which happens to + // be valid escape sequences will be escaped. + std::string escape_chars(characters + escape_tag[0]); + + // Check for a first occurrence. If none, just return a + // copy of the original. + char_pos = orig_str.find_first_of(escape_chars, prev_pos); + if (char_pos == std::string::npos) { + return(orig_str); + } + + std::stringstream ss; + while (char_pos < orig_str.size()) { + // Copy everything upto the character to escape. + ss << orig_str.substr(prev_pos, char_pos - prev_pos); + + // Copy the escape tag followed by the hex digits of the character. + ss << escape_tag << std::hex << std::setw(2) + << static_cast<uint16_t>(orig_str[char_pos]); + + ++char_pos; + prev_pos = char_pos; + + // Find the next character to escape. + char_pos = orig_str.find_first_of(escape_chars, prev_pos); + + // If no more, copy the remainder of the string. + if (char_pos == std::string::npos) { + ss << orig_str.substr(prev_pos, char_pos - prev_pos); + break; + } + + }; + + // Return the escaped string. + return(ss.str()); +} + +std::string +CSVRow::unescapeCharacters(const std::string& escaped_str) { + size_t esc_pos = 0; + size_t start_pos = 0; + + // Look for the escape tag. + esc_pos = escaped_str.find(escape_tag, start_pos); + if (esc_pos == std::string::npos) { + // No escape tags at all, we're done. + return(escaped_str); + } + + // We have at least one escape tag. + std::stringstream ss; + while (esc_pos < escaped_str.size()) { + // Save everything up to the tag. + ss << escaped_str.substr(start_pos, esc_pos - start_pos); + + // Now we need to see if we have valid hex digits + // following the tag. + unsigned int escaped_char = 0; + bool converted = true; + size_t dig_pos = esc_pos + escape_tag.size(); + if (dig_pos <= escaped_str.size() - 2) { + for (int i = 0; i < 2; ++i) { + uint8_t digit = escaped_str[dig_pos]; + + if (digit >= 'a' && digit <= 'f') { + digit = digit - 'a' + 10; + } else if (digit >= 'A' && digit <= 'F') { + digit = digit - 'A' + 10; + } else if (digit >= '0' && digit <= '9') { + digit -= '0'; + } else { + converted = false; + break; + } + + if (i == 0) { + escaped_char = digit << 4; + } else { + escaped_char |= digit; + } + + ++dig_pos; + } + } + + // If we converted an escaped character, add it. + if (converted) { + ss << static_cast<unsigned char>(escaped_char); + esc_pos = dig_pos; + } else { + // Apparently the escape_tag was not followed by two valid hex + // digits. We'll assume it just happens to be in the string, so + // we'll include it in the output. + ss << escape_tag; + esc_pos += escape_tag.size(); + } + + // Set the new start of search. + start_pos = esc_pos; + + // Look for the next escape tag. + esc_pos = escaped_str.find(escape_tag, start_pos); + + // If we're at the end we're done. + if (esc_pos == std::string::npos) { + // Make sure we grab the remnant. + ss << escaped_str.substr(start_pos, esc_pos - start_pos); + break; + } + }; + + return(ss.str()); +} + + +} // end of isc::util namespace +} // end of isc namespace |