// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software 
 * Foundation.  See file COPYING.
 * 
 */

#include "PaxosService.h"
#include "common/Clock.h"
#include "common/config.h"
#include "include/stringify.h"
#include "include/ceph_assert.h"
#include "mon/MonOpRequest.h"

using std::ostream;
using std::string;

using ceph::bufferlist;

#define dout_subsys ceph_subsys_paxos
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, paxos, service_name, get_first_committed(), get_last_committed())
static ostream& _prefix(std::ostream *_dout, Monitor &mon, Paxos &paxos, string service_name,
			version_t fc, version_t lc) {
  return *_dout << "mon." << mon.name << "@" << mon.rank
		<< "(" << mon.get_state_name()
		<< ").paxosservice(" << service_name << " " << fc << ".." << lc << ") ";
}

bool PaxosService::dispatch(MonOpRequestRef op)
{
  ceph_assert(op->is_type_service() || op->is_type_command());
  auto m = op->get_req<PaxosServiceMessage>();
  op->mark_event("psvc:dispatch");

  dout(10) << __func__ << " " << m << " " << *m
	   << " from " << m->get_orig_source_inst()
	   << " con " << m->get_connection() << dendl;

  if (mon.is_shutdown()) {
    return true;
  }

  // make sure this message isn't forwarded from a previous election epoch
  if (m->rx_election_epoch &&
      m->rx_election_epoch < mon.get_epoch()) {
    dout(10) << " discarding forwarded message from previous election epoch "
	     << m->rx_election_epoch << " < " << mon.get_epoch() << dendl;
    return true;
  }

  // make sure the client is still connected.  note that a proxied
  // connection will be disconnected with a null message; don't drop
  // those.  also ignore loopback (e.g., log) messages.
  if (m->get_connection() &&
      !m->get_connection()->is_connected() &&
      m->get_connection() != mon.con_self &&
      m->get_connection()->get_messenger() != NULL) {
    dout(10) << " discarding message from disconnected client "
	     << m->get_source_inst() << " " << *m << dendl;
    return true;
  }

  // make sure our map is readable and up to date
  if (!is_readable(m->version)) {
    dout(10) << " waiting for paxos -> readable (v" << m->version << ")" << dendl;
    wait_for_readable(op, new C_RetryMessage(this, op), m->version);
    return true;
  }

  // preprocess
  if (preprocess_query(op)) 
    return true;  // easy!

  // leader?
  if (!mon.is_leader()) {
    mon.forward_request_leader(op);
    return true;
  }
  
  // writeable?
  if (!is_writeable()) {
    dout(10) << " waiting for paxos -> writeable" << dendl;
    wait_for_writeable(op, new C_RetryMessage(this, op));
    return true;
  }

  // update
  if (!prepare_update(op)) {
    // no changes made.
    return true;
  }

  if (need_immediate_propose) {
    dout(10) << __func__ << " forced immediate propose" << dendl;
    need_immediate_propose = false;
    propose_pending();
    return true;
  }

  double delay = 0.0;
  if (!should_propose(delay)) {
    dout(10) << " not proposing" << dendl;
    return true;
  }

  if (delay == 0.0) {
    propose_pending();
    return true;
  }

  // delay a bit
  if (!proposal_timer) {
    /**
       * Callback class used to propose the pending value once the proposal_timer
       * fires up.
       */
    auto do_propose = new C_MonContext{&mon, [this](int r) {
        proposal_timer = 0;
        if (r >= 0) {
          propose_pending();
        } else if (r == -ECANCELED || r == -EAGAIN) {
          return;
        } else {
          ceph_abort_msg("bad return value for proposal_timer");
        }
    }};
    dout(10) << " setting proposal_timer " << do_propose
             << " with delay of " << delay << dendl;
    proposal_timer = mon.timer.add_event_after(delay, do_propose);
  } else {
    dout(10) << " proposal_timer already set" << dendl;
  }
  return true;
}

void PaxosService::refresh(bool *need_bootstrap)
{
  // update cached versions
  cached_first_committed = mon.store->get(get_service_name(), first_committed_name);
  cached_last_committed = mon.store->get(get_service_name(), last_committed_name);

  version_t new_format = get_value("format_version");
  if (new_format != format_version) {
    dout(1) << __func__ << " upgraded, format " << format_version << " -> " << new_format << dendl;
    on_upgrade();
  }
  format_version = new_format;

  dout(10) << __func__ << dendl;

  update_from_paxos(need_bootstrap);
}

void PaxosService::post_refresh()
{
  dout(10) << __func__ << dendl;

  post_paxos_update();

  if (mon.is_peon() && !waiting_for_finished_proposal.empty()) {
    finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN);
  }
}

bool PaxosService::should_propose(double& delay)
{
  // simple default policy: quick startup, then some damping.
  if (get_last_committed() <= 1) {
    delay = 0.0;
  } else {
    utime_t now = ceph_clock_now();
    if ((now - paxos.last_commit_time) > g_conf()->paxos_propose_interval)
      delay = (double)g_conf()->paxos_min_wait;
    else
      delay = (double)(g_conf()->paxos_propose_interval + paxos.last_commit_time
		       - now);
  }
  return true;
}


void PaxosService::propose_pending()
{
  dout(10) << __func__ << dendl;
  ceph_assert(have_pending);
  ceph_assert(!proposing);
  ceph_assert(mon.is_leader());
  ceph_assert(is_active());

  if (proposal_timer) {
    dout(10) << " canceling proposal_timer " << proposal_timer << dendl;
    mon.timer.cancel_event(proposal_timer);
    proposal_timer = NULL;
  }

  /**
   * @note What we contribute to the pending Paxos transaction is
   *	   obtained by calling a function that must be implemented by
   *	   the class implementing us.  I.e., the function
   *	   encode_pending will be the one responsible to encode
   *	   whatever is pending on the implementation class into a
   *	   bufferlist, so we can then propose that as a value through
   *	   Paxos.
   */
  MonitorDBStore::TransactionRef t = paxos.get_pending_transaction();

  if (should_stash_full())
    encode_full(t);

  encode_pending(t);
  have_pending = false;

  if (format_version > 0) {
    t->put(get_service_name(), "format_version", format_version);
  }

  // apply to paxos
  proposing = true;
  /**
   * Callback class used to mark us as active once a proposal finishes going
   * through Paxos.
   *
   * We should wake people up *only* *after* we inform the service we
   * just went active. And we should wake people up only once we finish
   * going active. This is why we first go active, avoiding to wake up the
   * wrong people at the wrong time, such as waking up a C_RetryMessage
   * before waking up a C_Active, thus ending up without a pending value.
   */
  class C_Committed : public Context {
    PaxosService *ps;
  public:
    explicit C_Committed(PaxosService *p) : ps(p) { }
    void finish(int r) override {
      ps->proposing = false;
      if (r >= 0)
	ps->_active();
      else if (r == -ECANCELED || r == -EAGAIN)
	return;
      else
	ceph_abort_msg("bad return value for C_Committed");
    }
  };
  paxos.queue_pending_finisher(new C_Committed(this));
  paxos.trigger_propose();
}

bool PaxosService::should_stash_full()
{
  version_t latest_full = get_version_latest_full();
  /* @note The first member of the condition is moot and it is here just for
   *	   clarity's sake. The second member would end up returing true
   *	   nonetheless because, in that event,
   *	      latest_full == get_trim_to() == 0.
   */
  return (!latest_full ||
	  (latest_full <= get_trim_to()) ||
	  (get_last_committed() - latest_full > (version_t)g_conf()->paxos_stash_full_interval));
}

void PaxosService::restart()
{
  dout(10) << __func__ << dendl;
  if (proposal_timer) {
    dout(10) << " canceling proposal_timer " << proposal_timer << dendl;
    mon.timer.cancel_event(proposal_timer);
    proposal_timer = 0;
  }

  finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN);

  if (have_pending) {
    discard_pending();
    have_pending = false;
  }
  proposing = false;

  on_restart();
}

void PaxosService::election_finished()
{
  dout(10) << __func__ << dendl;

  finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN);

  // make sure we update our state
  _active();
}

void PaxosService::_active()
{
  if (is_proposing()) {
    dout(10) << __func__ << " - proposing" << dendl;
    return;
  }
  if (!is_active()) {
    dout(10) << __func__ << " - not active" << dendl;
    /**
     * Callback used to make sure we call the PaxosService::_active function
     * whenever a condition is fulfilled.
     *
     * This is used in multiple situations, from waiting for the Paxos to commit
     * our proposed value, to waiting for the Paxos to become active once an
     * election is finished.
     */
    class C_Active : public Context {
      PaxosService *svc;
    public:
      explicit C_Active(PaxosService *s) : svc(s) {}
      void finish(int r) override {
	if (r >= 0)
	  svc->_active();
      }
    };
    wait_for_active_ctx(new C_Active(this));
    return;
  }
  dout(10) << __func__ << dendl;

  // create pending state?
  if (mon.is_leader()) {
    dout(7) << __func__ << " creating new pending" << dendl;
    if (!have_pending) {
      create_pending();
      have_pending = true;
    }

    if (get_last_committed() == 0) {
      // create initial state
      create_initial();
      propose_pending();
      return;
    }
  } else {
    dout(7) << __func__ << " we are not the leader, hence we propose nothing!" << dendl;
  }

  // wake up anyone who came in while we were proposing.  note that
  // anyone waiting for the previous proposal to commit is no longer
  // on this list; it is on Paxos's.
  finish_contexts(g_ceph_context, waiting_for_finished_proposal, 0);

  if (mon.is_leader())
    upgrade_format();

  // NOTE: it's possible that this will get called twice if we commit
  // an old paxos value.  Implementations should be mindful of that.
  on_active();
}


void PaxosService::shutdown()
{
  cancel_events();

  if (proposal_timer) {
    dout(10) << " canceling proposal_timer " << proposal_timer << dendl;
    mon.timer.cancel_event(proposal_timer);
    proposal_timer = 0;
  }

  finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN);

  on_shutdown();
}

void PaxosService::maybe_trim()
{
  if (!is_writeable())
    return;

  const version_t first_committed = get_first_committed();
  version_t trim_to = get_trim_to();
  dout(20) << __func__ << " " << first_committed << "~" << trim_to << dendl;

  if (trim_to < first_committed) {
    dout(10) << __func__ << " trim_to " << trim_to << " < first_committed "
	     << first_committed << dendl;
    return;
  }

  version_t to_remove = trim_to - first_committed;
  const version_t trim_min = g_conf().get_val<version_t>("paxos_service_trim_min");
  if (trim_min > 0 &&
      to_remove < trim_min) {
    dout(10) << __func__ << " trim_to " << trim_to << " would only trim " << to_remove
	     << " < paxos_service_trim_min " << trim_min << dendl;
    return;
  }

  to_remove = [to_remove, trim_to, this] {
    const version_t trim_max = g_conf().get_val<version_t>("paxos_service_trim_max");
    if (trim_max == 0 || to_remove < trim_max) {
      return to_remove;
    }
    if (to_remove < trim_max * 1.5) {
      dout(10) << __func__ << " trim to " << trim_to << " would only trim " << to_remove
             << " > paxos_service_trim_max, limiting to " << trim_max
             << dendl;
      return trim_max;
    }
    const version_t new_trim_max = (trim_max + to_remove) / 2;
    const uint64_t trim_max_multiplier = g_conf().get_val<uint64_t>("paxos_service_trim_max_multiplier");
    if (trim_max_multiplier) {
      return std::min(new_trim_max, trim_max * trim_max_multiplier);
    } else {
      return new_trim_max;
    }
  }();
  trim_to = first_committed + to_remove;

  dout(10) << __func__ << " trimming to " << trim_to << ", " << to_remove << " states" << dendl;
  MonitorDBStore::TransactionRef t = paxos.get_pending_transaction();
  trim(t, first_committed, trim_to);
  put_first_committed(t, trim_to);
  cached_first_committed = trim_to;

  // let the service add any extra stuff
  encode_trim_extra(t, trim_to);

  paxos.trigger_propose();
}

void PaxosService::trim(MonitorDBStore::TransactionRef t,
			version_t from, version_t to)
{
  dout(10) << __func__ << " from " << from << " to " << to << dendl;
  ceph_assert(from != to);

  for (version_t v = from; v < to; ++v) {
    dout(20) << __func__ << " " << v << dendl;
    t->erase(get_service_name(), v);

    string full_key = mon.store->combine_strings("full", v);
    if (mon.store->exists(get_service_name(), full_key)) {
      dout(20) << __func__ << " " << full_key << dendl;
      t->erase(get_service_name(), full_key);
    }
  }
  if (g_conf()->mon_compact_on_trim) {
    dout(20) << " compacting prefix " << get_service_name() << dendl;
    t->compact_range(get_service_name(), stringify(from - 1), stringify(to));
    t->compact_range(get_service_name(),
		     mon.store->combine_strings(full_prefix_name, from - 1),
		     mon.store->combine_strings(full_prefix_name, to));
  }
}

void PaxosService::load_health()
{
  bufferlist bl;
  mon.store->get("health", service_name, bl);
  if (bl.length()) {
    auto p = bl.cbegin();
    using ceph::decode;
    decode(health_checks, p);
  }
}