summaryrefslogtreecommitdiffstats
path: root/src/mon/PGMap.h
blob: 2e3df3fa8b609472ad510adc9211d5b0cf47420e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software 
 * Foundation.  See file COPYING.
 * 
 */
 
/*
 * Placement Group Map. Placement Groups are logical sets of objects
 * that are replicated by the same set of devices. pgid=(r,hash(o)&m)
 * where & is a bit-wise AND and m=2^k-1
 */

#ifndef CEPH_PGMAP_H
#define CEPH_PGMAP_H

#include "include/health.h"
#include "common/debug.h"
#include "common/TextTable.h"
#include "osd/osd_types.h"
#include "include/mempool.h"
#include "mon/health_check.h"
#include <sstream>

namespace ceph { class Formatter; }

class PGMapDigest {
public:
  MEMPOOL_CLASS_HELPERS();
  /// Virtual destructor: PGMapDigest declares virtual members and is used
  /// as a base class (see PGMap), so destruction must dispatch dynamically.
  virtual ~PGMapDigest() = default;

  mempool::pgmap::vector<uint64_t> osd_last_seq;

  mutable std::map<int, int64_t> avail_space_by_rule;

  // aggregate state, populated by PGMap child
  int64_t num_pg = 0, num_osd = 0;
  int64_t num_pg_active = 0;
  int64_t num_pg_unknown = 0;
  mempool::pgmap::unordered_map<int32_t,pool_stat_t> pg_pool_sum;
  mempool::pgmap::map<int64_t,int64_t> num_pg_by_pool;
  pool_stat_t pg_sum;
  osd_stat_t osd_sum;
  mempool::pgmap::map<std::string,osd_stat_t> osd_sum_by_class;
  mempool::pgmap::unordered_map<uint64_t,int32_t> num_pg_by_state;
  /**
   * Per-OSD PG counters; used as the mapped type of num_pg_by_osd.
   *
   * All counters start at zero.  encode() and decode() serialize the three
   * fields back to back with no surrounding envelope in this struct, so the
   * field order in the two functions must stay in lock-step.
   */
  struct pg_count {
    // value reported by get_num_pg_by_osd()
    int32_t acting = 0;
    // NOTE(review): presumably PGs where the OSD is in the up set but not in
    // the acting set -- confirm against the code that fills num_pg_by_osd.
    int32_t up_not_acting = 0;
    // value reported by get_num_primary_pg_by_osd()
    int32_t primary = 0;
    /// Append acting, up_not_acting, primary (in that order) to @p bl.
    void encode(ceph::buffer::list& bl) const {
      using ceph::encode;
      encode(acting, bl);
      encode(up_not_acting, bl);
      encode(primary, bl);
    }
    /// Read acting, up_not_acting, primary (in that order) from @p p.
    void decode(ceph::buffer::list::const_iterator& p) {
      using ceph::decode;
      decode(acting, p);
      decode(up_not_acting, p);
      decode(primary, p);
    }
  };
  mempool::pgmap::unordered_map<int32_t,pg_count> num_pg_by_osd;

  mempool::pgmap::map<int64_t,interval_set<snapid_t>> purged_snaps;

  /// True when the two counters kept in osd_sum, num_osds and
  /// num_per_pool_osds, are equal.
  bool use_per_pool_stats() const {
    const bool counts_match =
      (osd_sum.num_osds == osd_sum.num_per_pool_osds);
    return counts_match;
  }

  // recent deltas, and summation
  /**
   * keep track of last deltas for each pool, calculated using
   * @p pg_pool_sum as baseline.
   */
  mempool::pgmap::unordered_map<int64_t, mempool::pgmap::list<std::pair<pool_stat_t, utime_t> > > per_pool_sum_deltas;
  /**
   * keep track of per-pool timestamp deltas, according to last update on
   * each pool.
   */
  mempool::pgmap::unordered_map<int64_t, utime_t> per_pool_sum_deltas_stamps;
  /**
   * keep track of sum deltas, per-pool, taking into account any previous
   * deltas existing in @p per_pool_sum_deltas.  The utime_t as second member
   * of the pair is the timestamp referring to the last update (i.e., the first
   * member of the pair) for a given pool.
   */
  mempool::pgmap::unordered_map<int64_t, std::pair<pool_stat_t,utime_t> > per_pool_sum_delta;

  pool_stat_t pg_sum_delta;
  utime_t stamp_delta;

  void get_recovery_stats(
    double *misplaced_ratio,
    double *degraded_ratio,
    double *inactive_ratio,
    double *unknown_pgs_ratio) const;

  void print_summary(ceph::Formatter *f, std::ostream *out) const;
  void print_oneline_summary(ceph::Formatter *f, std::ostream *out) const;

  void recovery_summary(ceph::Formatter *f, std::list<std::string> *psl,
                        const pool_stat_t& pool_sum) const;
  void overall_recovery_summary(ceph::Formatter *f, std::list<std::string> *psl) const;
  void pool_recovery_summary(ceph::Formatter *f, std::list<std::string> *psl,
                             uint64_t poolid) const;
  void recovery_rate_summary(ceph::Formatter *f, std::ostream *out,
                             const pool_stat_t& delta_sum,
                             utime_t delta_stamp) const;
  void overall_recovery_rate_summary(ceph::Formatter *f, std::ostream *out) const;
  void pool_recovery_rate_summary(ceph::Formatter *f, std::ostream *out,
                                  uint64_t poolid) const;
  /**
   * Obtain a formatted/plain output for client I/O, source from stats for a
   * given @p delta_sum pool over a given @p delta_stamp period of time.
   */
  void client_io_rate_summary(ceph::Formatter *f, std::ostream *out,
                              const pool_stat_t& delta_sum,
                              utime_t delta_stamp) const;
  /**
   * Obtain a formatted/plain output for the overall client I/O, which is
   * calculated resorting to @p pg_sum_delta and @p stamp_delta.
   */
  void overall_client_io_rate_summary(ceph::Formatter *f, std::ostream *out) const;
  /**
   * Obtain a formatted/plain output for client I/O over a given pool
   * with id @p pool_id.  We will then obtain pool-specific data
   * from @p per_pool_sum_delta.
   */
  void pool_client_io_rate_summary(ceph::Formatter *f, std::ostream *out,
                                   uint64_t poolid) const;
  /**
   * Obtain a formatted/plain output for cache tier IO, source from stats for a
   * given @p delta_sum pool over a given @p delta_stamp period of time.
   */
  void cache_io_rate_summary(ceph::Formatter *f, std::ostream *out,
                             const pool_stat_t& delta_sum,
                             utime_t delta_stamp) const;
  /**
   * Obtain a formatted/plain output for the overall cache tier IO, which is
   * calculated resorting to @p pg_sum_delta and @p stamp_delta.
   */
  void overall_cache_io_rate_summary(ceph::Formatter *f, std::ostream *out) const;
  /**
   * Obtain a formatted/plain output for cache tier IO over a given pool
   * with id @p pool_id.  We will then obtain pool-specific data
   * from @p per_pool_sum_delta.
   */
  void pool_cache_io_rate_summary(ceph::Formatter *f, std::ostream *out,
                                  uint64_t poolid) const;

  /**
   * Return the number of additional bytes that can be stored in this
   * pool before the first OSD fills up, accounting for PG overhead.
   */
  int64_t get_pool_free_space(const OSDMap &osd_map, int64_t poolid) const;


  /**
   * Dump pool usage and io ops/bytes, used by "ceph df" command
   */
  virtual void dump_pool_stats_full(const OSDMap &osd_map, std::stringstream *ss,
				    ceph::Formatter *f, bool verbose) const;
  void dump_cluster_stats(std::stringstream *ss, ceph::Formatter *f, bool verbose) const;
  static void dump_object_stat_sum(TextTable &tbl, ceph::Formatter *f,
				   const pool_stat_t &pool_stat,
				   uint64_t avail,
				   float raw_used_rate,
				   bool verbose, bool per_pool,
				   const pg_pool_t *pool);

  /// Return the `acting` counter recorded for @p osd in num_pg_by_osd,
  /// or 0 when the OSD has no entry.
  size_t get_num_pg_by_osd(int osd) const {
    const auto entry = num_pg_by_osd.find(osd);
    if (entry != num_pg_by_osd.end())
      return entry->second.acting;
    return 0;
  }
  /// Return the `primary` counter recorded for @p osd in num_pg_by_osd,
  /// or 0 when the OSD has no entry.
  int get_num_primary_pg_by_osd(int osd) const {
    const auto entry = num_pg_by_osd.find(osd);
    if (entry != num_pg_by_osd.end())
      return entry->second.primary;
    return 0;
  }

  ceph_statfs get_statfs(OSDMap &osdmap,
                         boost::optional<int64_t> data_pool) const;

  /**
   * Look up the value cached in avail_space_by_rule for rule @p ruleno.
   *
   * @param ruleno  rule id used as the key into avail_space_by_rule
   * @return the cached value, or 0 when no entry exists for @p ruleno
   */
  int64_t get_rule_avail(int ruleno) const {
    auto i = avail_space_by_rule.find(ruleno);
    if (i == avail_space_by_rule.end())
      return 0;
    // Read through the iterator we already have.  Re-indexing with
    // operator[] would cost a second lookup and only compiles in this const
    // method because the map is declared mutable.
    return i->second;
  }

  // kill me post-mimic or -nautilus
  /// True only when there is at least one PG, none of them is unknown, and
  /// the summed stats report zero legacy snapsets.
  bool definitely_converted_snapsets() const {
    // false negative is okay; false positive is not!
    if (!num_pg)
      return false;
    if (num_pg_unknown != 0)
      return false;
    return pg_sum.stats.sum.num_legacy_snapsets == 0;
  }

  uint64_t get_last_osd_stat_seq(int osd) {
    if (osd < (int)osd_last_seq.size())
      return osd_last_seq[osd];
    return 0;
  }

  void encode(ceph::buffer::list& bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator& p);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<PGMapDigest*>& ls);
};
WRITE_CLASS_ENCODER(PGMapDigest::pg_count);
WRITE_CLASS_ENCODER_FEATURES(PGMapDigest);

class PGMap : public PGMapDigest {
public:
  MEMPOOL_CLASS_HELPERS();

  // the map
  version_t version;
  epoch_t last_osdmap_epoch;   // last osdmap epoch i applied to the pgmap
  epoch_t last_pg_scan;  // osdmap epoch
  mempool::pgmap::unordered_map<int32_t,osd_stat_t> osd_stat;
  mempool::pgmap::unordered_map<pg_t,pg_stat_t> pg_stat;

  typedef mempool::pgmap::map<
    std::pair<int64_t, int>,  // <pool, osd>
    store_statfs_t>
      per_osd_pool_statfs_t;

  per_osd_pool_statfs_t pool_statfs;

  class Incremental {
  public:
    MEMPOOL_CLASS_HELPERS();
    version_t version;
    mempool::pgmap::map<pg_t,pg_stat_t> pg_stat_updates;
    epoch_t osdmap_epoch;
    epoch_t pg_scan;  // osdmap epoch
    mempool::pgmap::set<pg_t> pg_remove;
    utime_t stamp;
    per_osd_pool_statfs_t pool_statfs_updates;

  private:
    mempool::pgmap::map<int32_t,osd_stat_t> osd_stat_updates;
    mempool::pgmap::set<int32_t> osd_stat_rm;
  public:

    /// Read-only view of the pending per-OSD stat updates.
    const mempool::pgmap::map<int32_t, osd_stat_t> &
    get_osd_stat_updates() const { return osd_stat_updates; }
    /// Read-only view of the OSD ids whose stats are marked for removal.
    const mempool::pgmap::set<int32_t> &
    get_osd_stat_rm() const { return osd_stat_rm; }
    /// Store @p stat as the pending update for @p osd, replacing any pending
    /// update already recorded for it.  @p stat is perfectly forwarded, so
    /// an rvalue argument is moved rather than copied.
    template<typename OsdStat>
    void update_stat(int32_t osd, OsdStat&& stat) {
      auto& slot = osd_stat_updates[osd];
      slot = std::forward<OsdStat>(stat);
    }
    /// Replace the pending update for @p osd with a default-constructed
    /// osd_stat_t.
    void stat_osd_out(int32_t osd) {
      auto& slot = osd_stat_updates[osd];
      slot = osd_stat_t();
    }
    void stat_osd_down_up(int32_t osd, const PGMap& pg_map) {
      // 0 the op_queue_age_hist for this osd
      auto p = osd_stat_updates.find(osd);
      if (p != osd_stat_updates.end()) {
	p->second.op_queue_age_hist.clear();
	return;
      }
      auto q = pg_map.osd_stat.find(osd);
      if (q != pg_map.osd_stat.end()) {
	osd_stat_t& t = osd_stat_updates[osd] = q->second;
	t.op_queue_age_hist.clear();
      }
    }
    /// Mark @p osd's stats for removal and discard any pending update for it.
    void rm_stat(int32_t osd) {
      osd_stat_updates.erase(osd);
      osd_stat_rm.insert(osd);
    }
    void dump(ceph::Formatter *f) const;
    static void generate_test_instances(std::list<Incremental*>& o);

    /// Start with a zero version and zero osdmap/pg-scan epochs; every other
    /// member is default-constructed.
    Incremental()
      : version(0),
        osdmap_epoch(0),
        pg_scan(0)
    {}
  };


  // aggregate stats (soft state), generated by calc_stats()
  mempool::pgmap::unordered_map<int,std::set<pg_t> > pg_by_osd;
  mempool::pgmap::unordered_map<int,int> blocked_by_sum;
  mempool::pgmap::list<std::pair<pool_stat_t, utime_t> > pg_sum_deltas;
  mempool::pgmap::unordered_map<int64_t,mempool::pgmap::unordered_map<uint64_t,int32_t>> num_pg_by_pool_state;

  utime_t stamp;

  void update_pool_deltas(
    CephContext *cct,
    const utime_t ts,
    const mempool::pgmap::unordered_map<int32_t, pool_stat_t>& pg_pool_sum_old);
  void clear_delta();

  /// Forget everything this map tracks for @p pool: its per-<pool, osd>
  /// statfs records plus the per-pool sums, PG counts and delta history.
  void deleted_pool(int64_t pool) {
    // pool_statfs is keyed by <pool, osd>; drop every record of this pool.
    auto it = pool_statfs.begin();
    while (it != pool_statfs.end()) {
      if (it->first.first != pool) {
        ++it;
        continue;
      }
      it = pool_statfs.erase(it);
    }

    // The remaining containers are keyed by pool id alone.
    per_pool_sum_delta.erase(pool);
    per_pool_sum_deltas_stamps.erase(pool);
    per_pool_sum_deltas.erase(pool);
    num_pg_by_pool.erase(pool);
    num_pg_by_pool_state.erase(pool);
    pg_pool_sum.erase(pool);
  }

 private:
  void update_delta(
    CephContext *cct,
    const utime_t ts,
    const pool_stat_t& old_pool_sum,
    utime_t *last_ts,
    const pool_stat_t& current_pool_sum,
    pool_stat_t *result_pool_delta,
    utime_t *result_ts_delta,
    mempool::pgmap::list<std::pair<pool_stat_t,utime_t> > *delta_avg_list);

  void update_one_pool_delta(CephContext *cct,
                             const utime_t ts,
                             const int64_t pool,
                             const pool_stat_t& old_pool_sum);

 public:

  mempool::pgmap::set<pg_t> creating_pgs;
  mempool::pgmap::map<int,std::map<epoch_t,std::set<pg_t> > > creating_pgs_by_osd_epoch;

  // Bits that used to be enum StuckPG
  static const int STUCK_INACTIVE = (1<<0);
  static const int STUCK_UNCLEAN = (1<<1);
  static const int STUCK_UNDERSIZED = (1<<2);
  static const int STUCK_DEGRADED = (1<<3);
  static const int STUCK_STALE = (1<<4);
  
  /// Build an empty map: version and both epochs start at zero, everything
  /// else is default-constructed.
  PGMap() : version(0), last_osdmap_epoch(0), last_pg_scan(0) {}

  /// Version of this map.
  version_t get_version() const { return version; }
  void set_version(version_t v) { version = v; }

  /// Last osdmap epoch applied to the pgmap.
  epoch_t get_last_osdmap_epoch() const { return last_osdmap_epoch; }
  void set_last_osdmap_epoch(epoch_t e) { last_osdmap_epoch = e; }

  /// Osdmap epoch stored in last_pg_scan.
  epoch_t get_last_pg_scan() const { return last_pg_scan; }
  void set_last_pg_scan(epoch_t e) { last_pg_scan = e; }

  /// Timestamp stored in stamp.
  utime_t get_stamp() const { return stamp; }
  void set_stamp(utime_t s) { stamp = s; }

  /// Return a copy of the summed stats recorded for @p pool in pg_pool_sum,
  /// or a default-constructed pool_stat_t when the pool has no entry.
  pool_stat_t get_pg_pool_sum_stat(int64_t pool) const {
    const auto found = pg_pool_sum.find(pool);
    if (found == pg_pool_sum.end())
      return pool_stat_t();
    return found->second;
  }

  /// Sum the stats of the OSDs listed in @p osds.  An empty set means "all"
  /// and returns the precomputed osd_sum; ids without an osd_stat entry are
  /// skipped.
  osd_stat_t get_osd_sum(const std::set<int>& osds) const {
    if (osds.empty()) // all
      return osd_sum;
    osd_stat_t total;
    for (auto it = osds.begin(); it != osds.end(); ++it) {
      if (const osd_stat_t *one = get_osd_stat(*it))
        total.add(*one);
    }
    return total;
  }

  /// Return a pointer to the stats stored for @p osd in osd_stat, or nullptr
  /// when the OSD has no entry.  The pointer refers to the map element
  /// itself, not to a copy.
  const osd_stat_t *get_osd_stat(int osd) const {
    const auto found = osd_stat.find(osd);
    return found == osd_stat.end() ? nullptr : &found->second;
  }


  void apply_incremental(CephContext *cct, const Incremental& inc);
  void calc_stats();
  void stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
		   bool sameosds=false);
  bool stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
		   bool sameosds=false);
  void calc_purged_snaps();
  void calc_osd_sum_by_class(const OSDMap& osdmap);
  void stat_osd_add(int osd, const osd_stat_t &s);
  void stat_osd_sub(int osd, const osd_stat_t &s);
  
  void encode(ceph::buffer::list &bl, uint64_t features=-1) const;
  void decode(ceph::buffer::list::const_iterator &bl);

  /// encode subset of our data to a PGMapDigest
  void encode_digest(const OSDMap& osdmap,
		     ceph::buffer::list& bl, uint64_t features);

  int64_t get_rule_avail(const OSDMap& osdmap, int ruleno) const;
  void get_rules_avail(const OSDMap& osdmap,
		       std::map<int,int64_t> *avail_map) const;
  void dump(ceph::Formatter *f, bool with_net = true) const;
  void dump_basic(ceph::Formatter *f) const;
  void dump_pg_stats(ceph::Formatter *f, bool brief) const;
  void dump_pool_stats(ceph::Formatter *f) const;
  void dump_osd_stats(ceph::Formatter *f, bool with_net = true) const;
  void dump_osd_ping_times(ceph::Formatter *f) const;
  void dump_delta(ceph::Formatter *f) const;
  void dump_filtered_pg_stats(ceph::Formatter *f, std::set<pg_t>& pgs) const;
  /// Refresh avail_space_by_rule from @p osd_map, then delegate to the
  /// PGMapDigest implementation with the same arguments.
  void dump_pool_stats_full(const OSDMap &osd_map,
                            std::stringstream *ss,
                            ceph::Formatter *f,
                            bool verbose) const override
  {
    // avail_space_by_rule is declared mutable, which is what allows it to be
    // refilled from this const method before the base class dumps.
    get_rules_avail(osd_map, &avail_space_by_rule);
    PGMapDigest::dump_pool_stats_full(osd_map, ss, f, verbose);
  }

  /*
  * Dump client io rate, recovery io rate, cache io rate and recovery information.
  * this function is used by "ceph osd pool stats" command
  */
  void dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map, ceph::Formatter *f,
				   std::stringstream *ss) const;

  void dump_pg_stats_plain(
    std::ostream& ss,
    const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
    bool brief) const;
  void get_stuck_stats(
    int types, const utime_t cutoff,
    mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const;
  bool get_stuck_counts(const utime_t cutoff, std::map<std::string, int>& note) const;
  void dump_stuck(ceph::Formatter *f, int types, utime_t cutoff) const;
  void dump_stuck_plain(std::ostream& ss, int types, utime_t cutoff) const;
  int dump_stuck_pg_stats(std::stringstream &ds,
			  ceph::Formatter *f,
			  int threshold,
			  std::vector<std::string>& args) const;
  void dump(std::ostream& ss) const;
  void dump_basic(std::ostream& ss) const;
  void dump_pg_stats(std::ostream& ss, bool brief) const;
  void dump_pg_sum_stats(std::ostream& ss, bool header) const;
  void dump_pool_stats(std::ostream& ss, bool header) const;
  void dump_osd_stats(std::ostream& ss) const;
  void dump_osd_sum_stats(std::ostream& ss) const;
  void dump_filtered_pg_stats(std::ostream& ss, std::set<pg_t>& pgs) const;

  void dump_osd_perf_stats(ceph::Formatter *f) const;
  void print_osd_perf_stats(std::ostream *ss) const;

  void dump_osd_blocked_by_stats(ceph::Formatter *f) const;
  void print_osd_blocked_by_stats(std::ostream *ss) const;

  void get_filtered_pg_stats(uint64_t state, int64_t poolid, int64_t osdid,
                             bool primary, std::set<pg_t>& pgs) const;

  void get_health_checks(
    CephContext *cct,
    const OSDMap& osdmap,
    health_check_map_t *checks) const;

  static void generate_test_instances(std::list<PGMap*>& o);
};
WRITE_CLASS_ENCODER_FEATURES(PGMap)

/// Stream a PGMapDigest by writing its one-line summary to @p out.
/// No Formatter is supplied, so print_oneline_summary() receives a null
/// formatter pointer and @p out as its output stream.
inline std::ostream& operator<<(std::ostream& out, const PGMapDigest& m) {
  // nullptr instead of NULL: type-safe null pointer constant.
  m.print_oneline_summary(nullptr, &out);
  return out;
}

int process_pg_map_command(
  const std::string& prefix,
  const cmdmap_t& cmdmap,
  const PGMap& pg_map,
  const OSDMap& osdmap,
  ceph::Formatter *f,
  std::stringstream *ss,
  ceph::buffer::list *odata);

/**
 * Collection of static helpers; the class has no data members.
 *
 * Both helpers take the OSDMap and PGMap by const reference and receive a
 * PGMap::Incremental pointer as their only non-const argument.  Their
 * definitions are not part of this header.
 */
class PGMapUpdater
{
public:
  // NOTE(review): presumably compares osdmap against pg_map and records any
  // resulting changes in pending_inc -- confirm against the definition.
  static void check_osd_map(
    CephContext *cct,
    const OSDMap &osdmap,
    const PGMap& pg_map,
    PGMap::Incremental *pending_inc);

  // mark pg's state stale if its acting primary osd is down
  // NOTE(review): check_all presumably selects every OSD instead of only
  // those in need_check_down_pg_osds -- confirm against the definition.
  static void check_down_pgs(
      const OSDMap &osd_map,
      const PGMap &pg_map,
      bool check_all,
      const std::set<int>& need_check_down_pg_osds,
      PGMap::Incremental *pending_inc);
};

namespace reweight {
/* Assign a lower weight to overloaded OSDs.
 *
 * The osds that will get a lower weight are those with a utilization
 * percentage 'oload' percent greater than the average utilization.
 */
  int by_utilization(const OSDMap &osd_map,
		     const PGMap &pg_map,
		     int oload,
		     double max_changef,
		     int max_osds,
		     bool by_pg, const std::set<int64_t> *pools,
		     bool no_increasing,
		     mempool::osdmap::map<int32_t, uint32_t>* new_weights,
		     std::stringstream *ss,
		     std::string *out_str,
		     ceph::Formatter *f);
}

#endif