// Copyright (C) 2015,2017 Internet Systems Consortium, Inc. ("ISC") // // This Source Code Form is subject to the terms of the Mozilla Public // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. #ifndef VERSIONED_CSV_FILE_H #define VERSIONED_CSV_FILE_H #include <util/csv_file.h> namespace isc { namespace util { /// @brief Exception thrown when an error occurs during CSV file processing. class VersionedCSVFileError : public Exception { public: VersionedCSVFileError(const char* file, size_t line, const char* what) : isc::Exception(file, line, what) { }; }; /// @brief Contains the metadata for a single column in a file. class VersionedColumn { public: /// @brief Constructor /// /// @param name Name of the column. /// @param version Text representation of the schema version in which /// this column first appeared. /// @param default_value The value the column should be assigned if it /// is not present in a data row. It defaults to an empty string, "" VersionedColumn(const std::string& name, const std::string& version, const std::string& default_value = "") : name_(name), version_(version), default_value_(default_value) { }; /// @brief Destructor virtual ~VersionedColumn(){}; /// @brief Name of the column. std::string name_; /// @brief Text representation of the schema version in which /// this column first appeared. std::string version_; /// @brief default_value The value the column should be assigned if it /// is not present in a data row. std::string default_value_; }; /// @brief Defines a smart pointer to VersionedColumn typedef boost::shared_ptr<VersionedColumn> VersionedColumnPtr; /// @brief Implements a CSV file that supports multiple versions of /// the file's "schema". This allows files with older schemas to be /// upgraded to newer schemas as they are being read. The file's schema /// is defined through a list of column descriptors, or @ref /// isc::util::VersionedColumn(s). Each descriptor contains metadata describing /// the column, consisting of the column's name, the version label in which /// the column was added to the schema, and a default value to be used if the /// column is missing from the file. Note that the column descriptors are /// defined in the order they occur in the file, when reading a row from left /// to right. This also assumes that when new version of the schema evolves, /// all new columns are added at the end of the row. In other words, the /// order of the columns reflects not only the order in which they occur /// in a row but also the order they were added to the schema. Conceptually, /// the entire list of columns defined constitutes the current schema. Earlier /// schema versions are therefore subsets of this list. Creating the schema /// is done by calling VersionedCSVfile::addColumn() for each column. Note /// that the schema must be defined prior to opening the file. /// /// The first row of the file is always the header row and is a comma-separated /// list of the names of the column in the file. This row is used when /// opening the file via @ref VersionedCSVFile::open(), to identify its schema /// version so that it may be be read correctly. This is done by comparing /// the column found in the header to the columns defined in the schema. The /// columns must match both by name and the order in which they occur. /// /// -# If there are fewer columns in the header than in the schema, the file /// is presumed to be an earlier schema version and will be upgraded as it is /// read. There is an ability to mark a specific column as being the minimum /// column which must be present, see @ref VersionedCSVFile::setMinimumValidColumns(). /// If the header columns do not match up to this /// minimum column, the file is presumed to be too old to upgrade and the /// open will fail. A valid, upgradable file will have an input schema /// state of VersionedCSVFile::NEEDS_UPGRADE. /// /// -# If there is a mismatch between a found column name and the column name /// defined for that position in the row, the file is presumed to be invalid /// and the open will fail. /// /// -# If the content of the header matches exactly the columns defined in /// the schema, the file is considered to match the schema exactly and the /// input schema state will VersionedCSVFile::CURRENT. /// /// -# If there columns in the header beyond all of the columns defined in /// the schema (i.e the schema is a subset of the header), then the file /// is presumed to be from a newer version of Kea and can be downgraded. The /// input schema state fo the file will be set to /// VersionedCSVFile::NEEDS_DOWNGRADE. /// /// After successfully opening a file, rows are read one at a time via /// @ref VersionedCSVFile::next() and handled according to the input schema /// state. Each data row is expected to have at least the same number of /// columns as were found in the header. Any row which as fewer values is /// discarded as invalid. Similarly, any row which is found to have more /// values than were found in the header is discarded as invalid. /// /// When upgrading a row, the values for each missing column is filled in /// with the default value specified by that column's descriptor. When /// downgrading a row, extraneous values are dropped from the row. /// /// It is important to note that upgrading or downgrading a file does NOT /// alter the physical file itself. Rather the conversion occurs after the /// raw data has been read but before it is passed to caller. /// /// Also note that there is currently no support for writing out a file in /// anything other than the current schema. class VersionedCSVFile : public CSVFile { public: /// @brief Possible input file schema states. /// Used to categorize the input file's schema, relative to the defined /// schema. enum InputSchemaState { CURRENT, NEEDS_UPGRADE, NEEDS_DOWNGRADE }; /// @brief Constructor. /// /// @param filename CSV file name. VersionedCSVFile(const std::string& filename); /// @brief Destructor virtual ~VersionedCSVFile(); /// @brief Adds metadata for a single column to the schema. /// /// This method appends a new column description to the file's schema. /// Note this does not cause anything to be written to the physical file. /// The name of the column will be placed in the CSV header when new file /// is created by calling @c recreate or @c open function. /// /// @param col_name Name of the column. /// @param version Text representation of the schema version in which /// this column first appeared. /// @param default_value value the missing column should be given during /// an upgrade. It defaults to an empty string, "" /// /// @throw CSVFileError if a column with the specified name exists. void addColumn(const std::string& col_name, const std::string& version, const std::string& default_value = ""); /// @brief Sets the minimum number of valid columns based on a given column /// /// @param column_name Name of the column which positionally represents /// the minimum columns which must be present in a file and to be /// considered valid. void setMinimumValidColumns(const std::string& column_name); /// @brief Returns the minimum number of columns which must be present /// for the file to be considered valid. size_t getMinimumValidColumns() const; /// @brief Returns the number of columns found in the input header size_t getInputHeaderCount() const; /// @brief Returns the number of valid columns found in the header /// For newly created files this will always match the number of defined /// columns (i.e. getColumnCount()). For existing files, this will be /// the number of columns in the header that match the defined columns. /// When this number is less than getColumnCount() it means the input file /// is from an earlier schema. This value is zero until the file has /// been opened. size_t getValidColumnCount() const; /// @brief Opens existing file or creates a new one. /// /// This function will try to open existing file if this file has size /// greater than 0. If the file doesn't exist or has size of 0, the /// file is recreated. If the existing file has been opened, the header /// is parsed and and validated against the schema. /// By default, the data pointer in the file is set to the beginning of /// the first data row. In order to retrieve the row contents the @c next /// function should be called. If a @c seek_to_end parameter is set to /// true, the file will be opened and the internal pointer will be set /// to the end of file. /// /// @param seek_to_end A boolean value which indicates if the input and /// output file pointer should be set at the end of file. /// /// @throw VersionedCSVFileError if schema has not been defined, /// CSVFileError when IO operation fails, or header fails to validate. virtual void open(const bool seek_to_end = false); /// @brief Creates a new CSV file. /// /// The file creation will fail if there are no columns specified. /// Otherwise, this function will write the header to the file. /// In order to write rows to opened file, the @c append function /// should be called. /// /// @throw VersionedCSVFileError if schema has not been defined /// CSVFileError if an IO operation fails virtual void recreate(); /// @brief Reads next row from the file file. /// /// This function will return the @c CSVRow object representing a /// parsed row if parsing is successful. If the end of file has been /// reached, the empty row is returned (a row containing no values). /// /// 1. If the row has fewer values than were found in the header it is /// discarded as invalid. /// /// 2. If the row is found to have more values than are defined in the /// schema it is discarded as invalid /// /// When a valid row has fewer than the defined number of columns, the /// values for each missing column is filled in with the default value /// specified by that column's descriptor. /// /// @param [out] row Object receiving the parsed CSV file. /// /// @return true if row has been read and validated; false if validation /// failed. bool next(CSVRow& row); /// @brief Returns the schema version of the physical file /// /// @return text version of the schema found or string "undefined" if the /// file has not been opened std::string getInputSchemaVersion() const; /// @brief text version of current schema supported by the file's metadata /// /// @return text version info assigned to the last column in the list of /// defined column, or the string "undefined" if no columns have been /// defined. std::string getSchemaVersion() const; /// @brief Fetch the column descriptor for a given index /// /// @param index index within the list of columns of the desired column /// @return a pointer to the VersionedColumn at the given index /// @throw OutOfRange exception if the index is invalid const VersionedColumnPtr& getVersionedColumn(const size_t index) const; /// @brief Fetches the state of the input file's schema /// /// Reflects that state of the input file's schema relative to the /// defined schema as a enum, InputSchemaState. /// /// @return VersionedCSVFile::CURRENT if the input file schema matches /// the defined schema, NEEDS_UPGRADE if the input file schema is older, /// and NEEDS_DOWNGRADE if it is newer enum InputSchemaState getInputSchemaState() const; /// @brief Returns true if the input file schema state is not CURRENT bool needsConversion() const; protected: /// @brief Validates the header of a VersionedCSVFile /// /// This function is called internally when the reading in an existing /// file. It parses the header row of the file, comparing each value /// in succession against the defined list of columns. If the header /// contains too few matching columns (i.e. less than @c /// minimum_valid_columns_) or too many (more than the number of defined /// columns), the file is presumed to be either too old, too new, or too /// corrupt to process. Otherwise it retains the number of valid columns /// found and deems the header valid. /// /// @param header A row holding a header. /// @return true if header matches the columns; false otherwise. virtual bool validateHeader(const CSVRow& header); /// @brief Convenience method for adding an error message /// /// Constructs an error message indicating that the number of columns /// in a given row are wrong and why, then adds it readMsg. /// /// @param row The row in error /// @param reason An explanation as to why the row column count is wrong void columnCountError(const CSVRow& row, const std::string& reason); private: /// @brief Holds the collection of column descriptors std::vector<VersionedColumnPtr> columns_; /// @brief Number of valid columns present in input file. If this is less /// than the number of columns defined, this implies the input file is /// from an earlier version of the code. size_t valid_column_count_; /// @brief Minimum number of valid columns an input file must contain. /// If an input file does not meet this number it cannot be upgraded. size_t minimum_valid_columns_; /// @brief The number of columns found in the input header row /// This value represent the number of columns present, in the header /// valid or otherwise. size_t input_header_count_; /// @brief The state of the input schema in relation to the current schema enum InputSchemaState input_schema_state_; }; } // namespace isc::util } // namespace isc #endif // VERSIONED_CSV_FILE_H