From 19fcec84d8d7d21e796c7624e521b60d28ee21ed Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 20:45:59 +0200 Subject: Adding upstream version 16.2.11+ds. Signed-off-by: Daniel Baumann --- src/erasure-code/shec/ErasureCodeShec.cc | 815 +++++++++++++++++++++++++++++++ 1 file changed, 815 insertions(+) create mode 100644 src/erasure-code/shec/ErasureCodeShec.cc (limited to 'src/erasure-code/shec/ErasureCodeShec.cc') diff --git a/src/erasure-code/shec/ErasureCodeShec.cc b/src/erasure-code/shec/ErasureCodeShec.cc new file mode 100644 index 000000000..3634be2fb --- /dev/null +++ b/src/erasure-code/shec/ErasureCodeShec.cc @@ -0,0 +1,815 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 FUJITSU LIMITED + * Copyright (C) 2013,2014 Cloudwatt + * Copyright (C) 2014 Red Hat + * + * Author: Takanori Nakao + * Author: Takeshi Miyamae + * Author: Loic Dachary + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include "common/debug.h" +#include "ErasureCodeShec.h" +extern "C" { +#include "jerasure/include/jerasure.h" +#include "jerasure/include/galois.h" + +extern int calc_determinant(int *matrix, int dim); +extern int* reed_sol_vandermonde_coding_matrix(int k, int m, int w); +} + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout) + +using namespace std; +using namespace ceph; + + +static ostream& _prefix(std::ostream* _dout) +{ + return *_dout << "ErasureCodeShec: "; +} + +int ErasureCodeShec::init(ErasureCodeProfile &profile, + ostream *ss) +{ + int err = 0; + err |= parse(profile); + if (err) + return err; + prepare(); + return ErasureCode::init(profile, ss); +} + +unsigned int ErasureCodeShec::get_chunk_size(unsigned int object_size) const +{ + unsigned alignment = get_alignment(); + unsigned tail = object_size % alignment; + unsigned padded_length = object_size + ( tail ? ( alignment - tail ) : 0 ); + + ceph_assert(padded_length % k == 0); + return padded_length / k; +} + +int ErasureCodeShec::_minimum_to_decode(const set &want_to_read, + const set &available_chunks, + set *minimum_chunks) +{ + if (!minimum_chunks) return -EINVAL; + + for (set::iterator it = available_chunks.begin(); it != available_chunks.end(); ++it){ + if (*it < 0 || k+m <= *it) return -EINVAL; + } + + for (set::iterator it = want_to_read.begin(); it != want_to_read.end(); ++it){ + if (*it < 0 || k+m <= *it) return -EINVAL; + } + + int want[k + m]; + int avails[k + m]; + int minimum[k + m]; + + memset(want, 0, sizeof(want)); + memset(avails, 0, sizeof(avails)); + memset(minimum, 0, sizeof(minimum)); + (*minimum_chunks).clear(); + + for (set::const_iterator i = want_to_read.begin(); + i != want_to_read.end(); + ++i) { + want[*i] = 1; + } + + for (set::const_iterator i = available_chunks.begin(); + i != available_chunks.end(); + ++i) { + avails[*i] = 1; + } + + { + int decoding_matrix[k*k]; + int dm_row[k]; + int dm_column[k]; + memset(decoding_matrix, 0, sizeof(decoding_matrix)); + memset(dm_row, 0, sizeof(dm_row)); + memset(dm_column, 0, sizeof(dm_column)); + if (shec_make_decoding_matrix(true, want, avails, decoding_matrix, dm_row, dm_column, minimum) < 0) { + return -EIO; + } + } + + for (int i = 0; i < k + m; i++) { + if (minimum[i] == 1) minimum_chunks->insert(i); + } + + return 0; +} + +int ErasureCodeShec::minimum_to_decode_with_cost(const set &want_to_read, + const map &available, + set *minimum_chunks) +{ + set available_chunks; + + for (map::const_iterator i = available.begin(); + i != available.end(); + ++i) + available_chunks.insert(i->first); + + return _minimum_to_decode(want_to_read, available_chunks, minimum_chunks); +} + +int ErasureCodeShec::encode(const set &want_to_encode, + const bufferlist &in, + map *encoded) +{ + unsigned int k = get_data_chunk_count(); + unsigned int m = get_chunk_count() - k; + bufferlist out; + + if (!encoded || !encoded->empty()){ + return -EINVAL; + } + + int err = encode_prepare(in, *encoded); + if (err) + return err; + encode_chunks(want_to_encode, encoded); + for (unsigned int i = 0; i < k + m; i++) { + if (want_to_encode.count(i) == 0) + encoded->erase(i); + } + return 0; +} + +int ErasureCodeShec::encode_chunks(const set &want_to_encode, + map *encoded) +{ + char *chunks[k + m]; + for (int i = 0; i < k + m; i++){ + chunks[i] = (*encoded)[i].c_str(); + } + shec_encode(&chunks[0], &chunks[k], (*encoded)[0].length()); + return 0; +} + +int ErasureCodeShec::_decode(const set &want_to_read, + const map &chunks, + map *decoded) +{ + vector have; + + if (!decoded || !decoded->empty()){ + return -EINVAL; + } + if (!want_to_read.empty() && chunks.empty()) { + // i need to get the blocksize from the first element of chunks + return -1; + } + + have.reserve(chunks.size()); + for (map::const_iterator i = chunks.begin(); + i != chunks.end(); + ++i) { + have.push_back(i->first); + } + if (includes( + have.begin(), have.end(), want_to_read.begin(), want_to_read.end())) { + for (set::iterator i = want_to_read.begin(); + i != want_to_read.end(); + ++i) { + (*decoded)[*i] = chunks.find(*i)->second; + } + return 0; + } + unsigned int k = get_data_chunk_count(); + unsigned int m = get_chunk_count() - k; + unsigned blocksize = (*chunks.begin()).second.length(); + for (unsigned int i = 0; i < k + m; i++) { + if (chunks.find(i) == chunks.end()) { + bufferlist tmp; + bufferptr ptr(buffer::create_aligned(blocksize, SIMD_ALIGN)); + tmp.push_back(ptr); + tmp.claim_append((*decoded)[i]); + (*decoded)[i].swap(tmp); + } else { + (*decoded)[i] = chunks.find(i)->second; + (*decoded)[i].rebuild_aligned(SIMD_ALIGN); + } + } + return decode_chunks(want_to_read, chunks, decoded); +} + +int ErasureCodeShec::decode_chunks(const set &want_to_read, + const map &chunks, + map *decoded) +{ + unsigned blocksize = (*chunks.begin()).second.length(); + int erased[k + m]; + int erased_count = 0; + int avails[k + m]; + char *data[k]; + char *coding[m]; + + for (int i = 0; i < k + m; i++) { + erased[i] = 0; + if (chunks.find(i) == chunks.end()) { + if (want_to_read.count(i) > 0) { + erased[i] = 1; + erased_count++; + } + avails[i] = 0; + } else { + avails[i] = 1; + } + if (i < k) + data[i] = (*decoded)[i].c_str(); + else + coding[i - k] = (*decoded)[i].c_str(); + } + + if (erased_count > 0) { + return shec_decode(erased, avails, data, coding, blocksize); + } else { + return 0; + } +} + +// +// ErasureCodeShecReedSolomonVandermonde +// + +void ErasureCodeShecReedSolomonVandermonde::shec_encode(char **data, + char **coding, + int blocksize) +{ + jerasure_matrix_encode(k, m, w, matrix, data, coding, blocksize); +} + +int ErasureCodeShecReedSolomonVandermonde::shec_decode(int *erased, + int *avails, + char **data, + char **coding, + int blocksize) +{ + return shec_matrix_decode(erased, avails, data, coding, blocksize); +} + +unsigned ErasureCodeShecReedSolomonVandermonde::get_alignment() const +{ + return k*w*sizeof(int); +} + +int ErasureCodeShecReedSolomonVandermonde::parse(const ErasureCodeProfile &profile) +{ + int err = 0; + // k, m, c + if (profile.find("k") == profile.end() && + profile.find("m") == profile.end() && + profile.find("c") == profile.end()){ + dout(10) << "(k, m, c) default to " << "(" << DEFAULT_K + << ", " << DEFAULT_M << ", " << DEFAULT_C << ")" << dendl; + k = DEFAULT_K; m = DEFAULT_M; c = DEFAULT_C; + } else if (profile.find("k") == profile.end() || + profile.find("m") == profile.end() || + profile.find("c") == profile.end()){ + dout(10) << "(k, m, c) must be chosen" << dendl; + err = -EINVAL; + } else { + std::string err_k, err_m, err_c, value_k, value_m, value_c; + value_k = profile.find("k")->second; + value_m = profile.find("m")->second; + value_c = profile.find("c")->second; + k = strict_strtol(value_k.c_str(), 10, &err_k); + m = strict_strtol(value_m.c_str(), 10, &err_m); + c = strict_strtol(value_c.c_str(), 10, &err_c); + + if (!err_k.empty() || !err_m.empty() || !err_c.empty()){ + if (!err_k.empty()){ + derr << "could not convert k=" << value_k << "to int" << dendl; + } else if (!err_m.empty()){ + derr << "could not convert m=" << value_m << "to int" << dendl; + } else if (!err_c.empty()){ + derr << "could not convert c=" << value_c << "to int" << dendl; + } + err = -EINVAL; + } else if (k <= 0){ + derr << "k=" << k + << " must be a positive number" << dendl; + err = -EINVAL; + } else if (m <= 0){ + derr << "m=" << m + << " must be a positive number" << dendl; + err = -EINVAL; + } else if (c <= 0){ + derr << "c=" << c + << " must be a positive number" << dendl; + err = -EINVAL; + } else if (m < c){ + derr << "c=" << c + << " must be less than or equal to m=" << m << dendl; + err = -EINVAL; + } else if (k > 12){ + derr << "k=" << k + << " must be less than or equal to 12" << dendl; + err = -EINVAL; + } else if (k+m > 20){ + derr << "k+m=" << k+m + << " must be less than or equal to 20" << dendl; + err = -EINVAL; + } else if (ksecond; + w = strict_strtol(value_w.c_str(), 10, &err_w); + + if (!err_w.empty()){ + derr << "could not convert w=" << value_w << "to int" << dendl; + dout(10) << "w default to " << DEFAULT_W << dendl; + w = DEFAULT_W; + + } else if (w != 8 && w != 16 && w != 32) { + derr << "w=" << w + << " must be one of {8, 16, 32}" << dendl; + dout(10) << "w default to " << DEFAULT_W << dendl; + w = DEFAULT_W; + + } else { + dout(10) << "w set to " << w << dendl; + } + } + return 0; +} + +void ErasureCodeShecReedSolomonVandermonde::prepare() +{ + // setup shared encoding table + int** p_enc_table = + tcache.getEncodingTable(technique, k, m, c, w); + + if (!*p_enc_table) { + dout(10) << "[ cache tables ] creating coeff for k=" << + k << " m=" << m << " c=" << c << " w=" << w << dendl; + + matrix = shec_reedsolomon_coding_matrix(technique); + + // either our new created table is stored or if it has been + // created in the meanwhile the locally allocated table will be + // freed by setEncodingTable + matrix = tcache.setEncodingTable(technique, k, m, c, w, matrix); + + dout(10) << "matrix = " << dendl; + for (int i=0; i 0) { + mat[j] = '1'; + } else { + mat[j] = '0'; + } + } + mat[k] = '\0'; + dout(10) << mat << dendl; + } + } else { + matrix = *p_enc_table; + } + + dout(10) << " [ technique ] = " << + ((technique == MULTIPLE) ? "multiple" : "single") << dendl; + + ceph_assert((technique == SINGLE) || (technique == MULTIPLE)); + +} + +// ErasureCodeShec:: +// Mearged from shec.cc. + +double ErasureCodeShec::shec_calc_recovery_efficiency1(int k, int m1, int m2, int c1, int c2){ + int r_eff_k[k]; + double r_e1; + int i, rr, cc, start, end; + int first_flag; + + if (m1 < c1 || m2 < c2) return -1; + if ((m1 == 0 && c1 != 0) || (m2 == 0 && c2 != 0)) return -1; + + for (i=0; i std::numeric_limits::epsilon() && + r_e1 < min_r_e1) { + min_r_e1 = r_e1; + c1_best = c1; + m1_best = m1; + } + } + } + } + m1 = m1_best; + c1 = c1_best; + m2 = m - m1_best; + c2 = c - c1_best; + } else { + m1 = 0; + c1 = 0; + m2 = m; + c2 = c; + } + + // create matrix + matrix = reed_sol_vandermonde_coding_matrix(k, m, w); + + for (rr=0; rr 0) { + want[j] = 1; + } + } + } + } + + if (tcache.getDecodingTableFromCache(decoding_matrix, + dm_row, dm_column, minimum, + technique, + k, m, c, w, + want, avails)) { + return 0; + } + + for (unsigned long long pp = 0; pp < (1ull << m); ++pp) { + + // select parity chunks + int ek = 0; + int p[m]; + for (int i=0; i < m; ++i) { + if (pp & (1ull << i)) { + p[ek++] = i; + } + } + if (ek > minp) { + continue; + } + + // Are selected parity chunks avail? + bool ok = true; + for (int i = 0; i < ek && ok; i++) { + if (!avails[k+p[i]]) { + ok = false; + break; + } + } + + if (!ok) { + continue; + } + + int tmprow[k + m]; + int tmpcolumn[k]; + for (int i = 0; i < k + m; i++) { + tmprow[i] = 0; + } + for (int i = 0; i < k; i++) { + tmpcolumn[i] = 0; + } + + for (int i=0; i < k; i++) { + if (want[i] && !avails[i]) { + tmpcolumn[i] = 1; + } + } + + // Parity chunks which are used to recovery erased data chunks, are added to tmprow. + for (int i = 0; i < ek; i++) { + tmprow[k + p[i]] = 1; + for (int j = 0; j < k; j++) { + int element = matrix[(p[i]) * k + j]; + if (element != 0) { + tmpcolumn[j] = 1; + } + if (element != 0 && avails[j] == 1) { + tmprow[j] = 1; + } + } + } + + int dup_row = 0, dup_column = 0, dup = 0; + for (int i = 0; i < k + m; i++) { + if (tmprow[i]) { + dup_row++; + } + } + + for (int i = 0; i < k; i++) { + if (tmpcolumn[i]) { + dup_column++; + } + } + + if (dup_row != dup_column) { + continue; + } + dup = dup_row; + if (dup == 0) { + mindup = dup; + for (int i = 0; i < k; i++) { + dm_row[i] = -1; + } + for (int i = 0; i < k; i++) { + dm_column[i] = -1; + } + break; + } + + // minimum is updated. + if (dup < mindup) { + int tmpmat[dup * dup]; + { + for (int i = 0, row = 0; i < k + m; i++) { + if (tmprow[i]) { + for (int j = 0, column = 0; j < k; j++) { + if (tmpcolumn[j]) { + if (i < k) { + tmpmat[row * dup + column] = (i == j ? 1 : 0); + } else { + tmpmat[row * dup + column] = matrix[(i - k) * k + j]; + } + column++; + } + } + row++; + } + } + } + int det = calc_determinant(tmpmat, dup); + + if (det != 0) { + int row_id = 0; + int column_id = 0; + for (int i = 0; i < k; i++) { + dm_row[i] = -1; + } + for (int i = 0; i < k; i++) { + dm_column[i] = -1; + } + + mindup = dup; + for (int i=0; i < k + m; i++) { + if (tmprow[i]) { + dm_row[row_id++] = i; + } + } + for (int i=0; i < k; i++) { + if (tmpcolumn[i]) { + dm_column[column_id++] = i; + } + } + minp = ek; + } + } + } + + + if (mindup == k+1) { + dout(10) << __func__ << ": can't find recover matrix." << dendl; + return -1; + } + + for (int i = 0; i < k + m; i++) { + minimum[i] = 0; + } + + for (int i=0; i < k && dm_row[i] != -1; i++) { + minimum[dm_row[i]] = 1; + } + + for (int i = 0; i < k; ++i) { + if (want[i] && avails[i]) { + minimum[i] = 1; + } + } + + for (int i = 0; i < m; ++i) { + if (want[k + i] && avails[k + i] && !minimum[k + i]) { + for (int j = 0; j < k; ++j) { + if (matrix[i * k + j] > 0 && !want[j]) { + minimum[k + i] = 1; + break; + } + } + } + } + + if (mindup == 0) { + return 0; + } + + int tmpmat[mindup * mindup]; + for (int i=0; i < mindup; i++) { + for (int j=0; j < mindup; j++) { + if (dm_row[i] < k) { + tmpmat[i * mindup + j] = (dm_row[i] == dm_column[j] ? 1 : 0); + } else { + tmpmat[i * mindup + j] = matrix[(dm_row[i] - k) * k + dm_column[j]]; + } + } + if (dm_row[i] < k) { + for (int j = 0; j < mindup; j++) { + if (dm_row[i] == dm_column[j]) { + dm_row[i] = j; + } + } + } else { + dm_row[i] -= (k - mindup); + } + } + + if (prepare) { + return 0; + } + + int ret = jerasure_invert_matrix(tmpmat, decoding_matrix, mindup, w); + + tcache.putDecodingTableToCache(decoding_matrix, dm_row, dm_column, minimum, technique, + k, m, c, w, want, avails); + + return ret; +} + +int ErasureCodeShec::shec_matrix_decode(int *want, int *avails, char **data_ptrs, + char **coding_ptrs, int size) +{ + int decoding_matrix[k*k]; + int dm_row[k], dm_column[k]; + int minimum[k + m]; + + memset(decoding_matrix, 0, sizeof(decoding_matrix)); + memset(dm_row, -1, sizeof(dm_row)); + memset(dm_column, -1, sizeof(dm_column)); + memset(minimum, -1, sizeof(minimum)); + + if (w != 8 && w != 16 && w != 32) return -1; + + if (shec_make_decoding_matrix(false, want, avails, decoding_matrix, + dm_row, dm_column, minimum) < 0) { + return -1; + } + + // Get decoding matrix size + int dm_size = 0; + for (int i = 0; i < k; i++) { + if (dm_row[i] == -1) { + break; + } + dm_size++; + } + + char *dm_data_ptrs[dm_size]; + for (int i = 0; i < dm_size; i++) { + dm_data_ptrs[i] = data_ptrs[dm_column[i]]; + } + + // Decode the data drives + for (int i = 0; i < dm_size; i++) { + if (!avails[dm_column[i]]) { + jerasure_matrix_dotprod(dm_size, w, decoding_matrix + (i * dm_size), + dm_row, i, dm_data_ptrs, coding_ptrs, size); + } + } + + // Re-encode any erased coding devices + for (int i = 0; i < m; i++) { + if (want[k+i] && !avails[k+i]) { + jerasure_matrix_dotprod(k, w, matrix + (i * k), NULL, i+k, + data_ptrs, coding_ptrs, size); + } + } + + return 0; +} -- cgit v1.2.3