diff options
Diffstat (limited to '')
-rw-r--r-- | src/erasure-code/ErasureCodeInterface.h | 468 |
1 files changed, 468 insertions, 0 deletions
diff --git a/src/erasure-code/ErasureCodeInterface.h b/src/erasure-code/ErasureCodeInterface.h new file mode 100644 index 00000000..b0c24e1e --- /dev/null +++ b/src/erasure-code/ErasureCodeInterface.h @@ -0,0 +1,468 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph distributed storage system + * + * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + */ + +#ifndef CEPH_ERASURE_CODE_INTERFACE_H +#define CEPH_ERASURE_CODE_INTERFACE_H + +/*! @file ErasureCodeInterface.h + @brief Interface provided by erasure code plugins + + The erasure coded pools rely on plugins implementing + **ErasureCodeInterface** to encode and decode content. All codes + are systematic (i.e. the data is not mangled and can be + reconstructed by concatenating chunks ). + + Methods returning an **int** return **0** on success and a + negative value on error. If the value returned on error is not + explained in **ErasureCodeInterface**, the sources or the + documentation of the interface implementer (i.e. the plugin ) must + be read to figure out what it means. It is recommended that each + error code matches an *errno* value that relates to the cause of + the error. + + If an object is small enough, the caller can process it with + one call to the **encode** or **decode** method. + + +---------------- coded object O -------------------------+ + |+----------------+ +----------------+ +----------------+ | + || chunk 0 | | chunk 1 | | chunk 2 | | + || [0,N) | | [N,2N) | | [2N,3N) | | + |+----------------+ +----------------+ +----------------+ | + +------^--------------------------------------------------+ + | + chunk B / C | offset B % C ( where C is the chunk size ) + | + +-----^---- raw object O ----+------+ + | B [0,X) | pad | + +----------------------------+------+ + + The object size is paded so that each chunks are of the same size. + In the example above, if the actual object size was X, then it + will be padded to 2N >= X assuming there are two data chunks (0 + and 1) and one coding chunk (2). + + For chunks of size C, byte B of the object is found in chunk number + B / C at offset B % C. + + If an object is too large to be encoded in memory, the caller + should divide it in smaller units named **stripes**. + + +---------------------- object O -------------------------+ + |+----------------+ +----------------+ +----------------+ | + stripe || chunk 0 | | chunk 1 | | chunk 2 | | + 0 || [0,N) | | [N,2N) | | [2N,3N) | | + |+----------------+ +----------------+ +----------------+ | + |+----------------+ +----------------+ +----------------+ | + stripe || chunk 0 | | chunk 1 | | chunk 2 | | + 1 || [X,M) | | [X+M,X+2M) | | [X+2M,X+3M) | | + || | | | | | | + |+----------------+ +----------------+ +----------------+ | + | ... | + +---------------------------------------------------------+ + + The interface does not concern itself with stripes nor does it + impose constraints on the size of each stripe. Variable names in + the interface always use **object** and never use **stripe**. + + Assuming the interface implementer provides three data chunks ( K + = 3 ) and two coding chunks ( M = 2 ), a buffer could be encoded as + follows: + + ~~~~~~~~~~~~~~~~{.c} + set<int> want_to_encode(0, 1, 2, // data chunks + 3, 4 // coding chunks + ); + bufferlist in = "ABCDEF"; + map<int, bufferlist> encoded + encode(want_to_encode, in, &encoded); + encoded[0] == "AB" // data chunk 0 + encoded[1] == "CD" // data chunk 1 + encoded[2] == "EF" // data chunk 2 + encoded[3] // coding chunk 0 + encoded[4] // coding chunk 1 + ~~~~~~~~~~~~~~~~ + + The **minimum_to_decode_with_cost** method can be used to minimize + the cost of fetching the chunks necessary to retrieve a given + content. For instance, if encoded[2] (contained **EF**) is missing + and accessing encoded[3] (the first coding chunk) is more + expensive than accessing encoded[4] (the second coding chunk), + **minimum_to_decode_with_cost** is expected to chose the first + coding chunk. + + ~~~~~~~~~~~~~~~~{.c} + set<int> want_to_read(2); // want the chunk containing "EF" + map<int,int> available( + 0 => 1, // data chunk 0 : available and costs 1 + 1 => 1, // data chunk 1 : available and costs 1 + // data chunk 2 : missing + 3 => 9, // coding chunk 1 : available and costs 9 + 4 => 1, // coding chunk 2 : available and costs 1 + ); + set<int> minimum; + minimum_to_decode_with_cost(want_to_read, + available, + &minimum); + minimum == set<int>(0, 1, 4); // NOT set<int>(0, 1, 3); + ~~~~~~~~~~~~~~~~ + + It sets **minimum** with three chunks to reconstruct the desired + data chunk and will pick the second coding chunk ( 4 ) because it + is less expensive ( 1 < 9 ) to retrieve than the first coding + chunk ( 3 ). The caller is responsible for retrieving the chunks + and call **decode** to reconstruct the second data chunk. + + ~~~~~~~~~~~~~~~~{.c} + map<int,bufferlist> chunks; + for i in minimum.keys(): + chunks[i] = fetch_chunk(i); // get chunk from storage + map<int, bufferlist> decoded; + decode(want_to_read, chunks, &decoded); + decoded[2] == "EF" + ~~~~~~~~~~~~~~~~ + + The semantic of the cost value is defined by the caller and must + be known to the implementer. For instance, it may be more + expensive to retrieve two chunks with cost 1 + 9 = 10 than two + chunks with cost 6 + 6 = 12. + */ + +#include <map> +#include <set> +#include <vector> +#include <ostream> +#include <memory> +#include <string> +#include "include/buffer_fwd.h" + +class CrushWrapper; + +namespace ceph { + + typedef std::map<std::string,std::string> ErasureCodeProfile; + + inline std::ostream& operator<<(std::ostream& out, const ErasureCodeProfile& profile) { + out << "{"; + for (ErasureCodeProfile::const_iterator it = profile.begin(); + it != profile.end(); + ++it) { + if (it != profile.begin()) out << ","; + out << it->first << "=" << it->second; + } + out << "}"; + return out; + } + + + class ErasureCodeInterface { + public: + virtual ~ErasureCodeInterface() {} + + /** + * Initialize the instance according to the content of + * **profile**. The **ss** stream is set with debug messages or + * error messages, the content of which depend on the + * implementation. + * + * Return 0 on success or a negative errno on error. When + * returning on error, the implementation is expected to + * provide a human readable explanation in **ss**. + * + * @param [in] profile a key/value map + * @param [out] ss contains informative messages when an error occurs + * @return 0 on success or a negative errno on error. + */ + virtual int init(ErasureCodeProfile &profile, std::ostream *ss) = 0; + + /** + * Return the profile that was used to initialize the instance + * with the **init** method. + * + * @return the profile in use by the instance + */ + virtual const ErasureCodeProfile &get_profile() const = 0; + + /** + * Create a new rule in **crush** under the name **name**, + * unless it already exists. + * + * Return the rule number that was created on success. If a + * rule **name** already exists, return -EEXISTS, otherwise + * return a negative value indicating an error with a semantic + * defined by the implementation. + * + * @param [in] name of the rule to create + * @param [in] crush crushmap in which the rule is created + * @param [out] ss contains informative messages when an error occurs + * @return a rule on success or a negative errno on error. + */ + virtual int create_rule(const std::string &name, + CrushWrapper &crush, + std::ostream *ss) const = 0; + + /** + * Return the number of chunks created by a call to the **encode** + * method. + * + * In the simplest case it can be K + M, i.e. the number + * of data chunks (K) plus the number of parity chunks + * (M). However, if the implementation provides local parity there + * could be an additional overhead. + * + * @return the number of chunks created by encode() + */ + virtual unsigned int get_chunk_count() const = 0; + + /** + * Return the number of data chunks created by a call to the + * **encode** method. The data chunks contain the buffer provided + * to **encode**, verbatim, with padding at the end of the last + * chunk. + * + * @return the number of data chunks created by encode() + */ + virtual unsigned int get_data_chunk_count() const = 0; + + /** + * Return the number of coding chunks created by a call to the + * **encode** method. The coding chunks are used to recover from + * the loss of one or more chunks. If there is one coding chunk, + * it is possible to recover from the loss of exactly one + * chunk. If there are two coding chunks, it is possible to + * recover from the loss of at most two chunks, etc. + * + * @return the number of coding chunks created by encode() + */ + virtual unsigned int get_coding_chunk_count() const = 0; + + /** + * Return the number of sub chunks chunks created by a call to the + * **encode** method. Each chunk can be viewed as union of sub-chunks + * For the case of array codes, the sub-chunk count > 1, where as the + * scalar codes have sub-chunk count = 1. + * + * @return the number of sub-chunks per chunk created by encode() + */ + virtual int get_sub_chunk_count() = 0; + + /** + * Return the size (in bytes) of a single chunk created by a call + * to the **decode** method. The returned size multiplied by + * **get_chunk_count()** is greater or equal to **object_size**. + * + * If the object size is properly aligned, the chunk size is + * **object_size / get_chunk_count()**. However, if + * **object_size** is not a multiple of **get_chunk_count** or if + * the implementation imposes additional alignment constraints, + * the chunk size may be larger. + * + * The byte found at offset **B** of the original object is mapped + * to chunk **B / get_chunk_size()** at offset **B % get_chunk_size()**. + * + * @param [in] object_size the number of bytes of the object to **encode()** + * @return the size (in bytes) of a single chunk created by **encode()** + */ + virtual unsigned int get_chunk_size(unsigned int object_size) const = 0; + + /** + * Compute the smallest subset of **available** chunks that needs + * to be retrieved in order to successfully decode + * **want_to_read** chunks. + * + * It is strictly equivalent to calling + * **minimum_to_decode_with_cost** where each **available** chunk + * has the same cost. + * + * @see minimum_to_decode_with_cost + * + * @param [in] want_to_read chunk indexes to be decoded + * @param [in] available chunk indexes containing valid data + * @param [out] minimum chunk indexes and corresponding + * subchunk index offsets, count. + * @return **0** on success or a negative errno on error. + */ + virtual int minimum_to_decode(const std::set<int> &want_to_read, + const std::set<int> &available, + std::map<int, std::vector<std::pair<int, int>>> + *minimum) = 0; + + /** + * Compute the smallest subset of **available** chunks that needs + * to be retrieved in order to successfully decode + * **want_to_read** chunks. If there are more than one possible + * subset, select the subset that minimizes the overall retrieval + * cost. + * + * The **available** parameter maps chunk indexes to their + * retrieval cost. The higher the cost value, the more costly it + * is to retrieve the chunk content. + * + * Returns -EIO if there are not enough chunk indexes in + * **available** to decode **want_to_read**. + * + * Returns 0 on success. + * + * The **minimum** argument must be a pointer to an empty set. + * + * @param [in] want_to_read chunk indexes to be decoded + * @param [in] available map chunk indexes containing valid data + * to their retrieval cost + * @param [out] minimum chunk indexes to retrieve + * @return **0** on success or a negative errno on error. + */ + virtual int minimum_to_decode_with_cost(const std::set<int> &want_to_read, + const std::map<int, int> &available, + std::set<int> *minimum) = 0; + + /** + * Encode the content of **in** and store the result in + * **encoded**. All buffers pointed to by **encoded** have the + * same size. The **encoded** map contains at least all chunk + * indexes found in the **want_to_encode** set. + * + * The **encoded** map is expected to be a pointer to an empty + * map. + * + * Assuming the **in** parameter is **length** bytes long, + * the concatenation of the first **length** bytes of the + * **encoded** buffers is equal to the content of the **in** + * parameter. + * + * The **encoded** map may contain more chunks than required by + * **want_to_encode** and the caller is expected to permanently + * store all of them, not just the chunks listed in + * **want_to_encode**. + * + * The **encoded** map may contain pointers to data stored in + * the **in** parameter. If the caller modifies the content of + * **in** after calling the encode method, it may have a side + * effect on the content of **encoded**. + * + * The **encoded** map may contain pointers to buffers allocated + * by the encode method. They will be freed when **encoded** is + * freed. The allocation method is not specified. + * + * Returns 0 on success. + * + * @param [in] want_to_encode chunk indexes to be encoded + * @param [in] in data to be encoded + * @param [out] encoded map chunk indexes to chunk data + * @return **0** on success or a negative errno on error. + */ + virtual int encode(const std::set<int> &want_to_encode, + const bufferlist &in, + std::map<int, bufferlist> *encoded) = 0; + + + virtual int encode_chunks(const std::set<int> &want_to_encode, + std::map<int, bufferlist> *encoded) = 0; + + /** + * Decode the **chunks** and store at least **want_to_read** + * chunks in **decoded**. + * + * The **decoded** map must be a pointer to an empty map. + * + * There must be enough **chunks** ( as returned by + * **minimum_to_decode** or **minimum_to_decode_with_cost** ) to + * perform a successful decoding of all chunks listed in + * **want_to_read**. + * + * All buffers pointed by **in** must have the same size. + * + * On success, the **decoded** map may contain more chunks than + * required by **want_to_read** and they can safely be used by the + * caller. + * + * If a chunk is listed in **want_to_read** and there is a + * corresponding **bufferlist** in **chunks**, it will be + * referenced in **decoded**. If not it will be reconstructed from + * the existing chunks. + * + * Because **decoded** may contain pointers to data found in + * **chunks**, modifying the content of **chunks** after calling + * decode may have a side effect on the content of **decoded**. + * + * Returns 0 on success. + * + * @param [in] want_to_read chunk indexes to be decoded + * @param [in] chunks map chunk indexes to chunk data + * @param [out] decoded map chunk indexes to chunk data + * @param [in] chunk_size chunk size + * @return **0** on success or a negative errno on error. + */ + virtual int decode(const std::set<int> &want_to_read, + const std::map<int, bufferlist> &chunks, + std::map<int, bufferlist> *decoded, int chunk_size) = 0; + + virtual int decode_chunks(const std::set<int> &want_to_read, + const std::map<int, bufferlist> &chunks, + std::map<int, bufferlist> *decoded) = 0; + + /** + * Return the ordered list of chunks or an empty vector + * if no remapping is necessary. + * + * By default encoding an object with K=2,M=1 will create three + * chunks, the first two are data and the last one coding. For + * a 10MB object, it would be: + * + * chunk 0 for the first 5MB + * chunk 1 for the last 5MB + * chunk 2 for the 5MB coding chunk + * + * The plugin may, however, decide to remap them in a different + * order, such as: + * + * chunk 0 for the last 5MB + * chunk 1 for the 5MB coding chunk + * chunk 2 for the first 5MB + * + * The vector<int> remaps the chunks so that the first chunks are + * data, in sequential order, and the last chunks contain parity + * in the same order as they were output by the encoding function. + * + * In the example above the mapping would be: + * + * [ 1, 2, 0 ] + * + * The returned vector<int> only contains information for chunks + * that need remapping. If no remapping is necessary, the + * vector<int> is empty. + * + * @return vector<int> list of indices of chunks to be remapped + */ + virtual const std::vector<int> &get_chunk_mapping() const = 0; + + /** + * Decode the first **get_data_chunk_count()** **chunks** and + * concatenate them into **decoded**. + * + * Returns 0 on success. + * + * @param [in] chunks map chunk indexes to chunk data + * @param [out] decoded concatenante of the data chunks + * @return **0** on success or a negative errno on error. + */ + virtual int decode_concat(const std::map<int, bufferlist> &chunks, + bufferlist *decoded) = 0; + }; + + typedef std::shared_ptr<ErasureCodeInterface> ErasureCodeInterfaceRef; + +} + +#endif |