secondo/Optimizer/Entropy/old/entropy.cpp

/*
entropy.cpp

*/


#include "NLF.h"
#include "BoundConstraint.h"
#include "LinearEquation.h"
#include "OptNIPS.h"
#include "OptCG.h"
#include "OptPDS.h"
#include "OptFDNewton.h"
#include "OptQNewton.h"
#include "OptConstrNewton.h"
#include "entropy.h"

#include <iostream>
#include <iomanip>
#include "satof.h"

// Iterates `i` over the 1-based row indices 1..v.Nrows() of `v`.
#define LOOP( i, v ) for( int i = 1; i <= v.Nrows(); i++ )
// Bit mask with only bit `j` set.
#define BIT(j) (1<<(j))
// Floor applied to solution components by the find_*_solution functions and
// switch-over point of the linear extension used in f_OS.
#define EPSILON     1.0e-96
// Margin used by adjust_joint_probabilities to keep probabilities off 0 and 1.
#define TOL_INIT    1.0e-8
// Tolerance used by verify_solution when checking a candidate solution.
#define TOL_SOL     1.0e-6


/*
Since this method is very sensitive to the initial point, we solve two problems:
first we find a viable solution (VS) to be used as the initial point, then we find
the optimal solution (OS). To find a viable solution we just set up a
boundary-constrained problem.

Once we've found a viable solution we don't need the boundary constraints for
probabilities since the entropy function is convex and its value is zero in the
valid boundary:

---- E(0) => - 0 ln(0) = 0, E(1) => - 1 ln(1) = 0.
----

Without these constraints the second problem converges faster and more accurately
because we don't have to use the log penalty function, which introduces round-off
errors.

*/

// NOTE(review): BOUND_CONSTRAINTS is defined here but is not referenced in the
// visible part of this file.
#define BOUND_CONSTRAINTS

using namespace std;
// Defined outside this file; maximize_entropy sets it to true before solving.
extern bool OptNIPSLike_interpolate;

// Initial-point and objective callbacks of the two optimization problems:
// the *_VS pair belongs to the viable-solution problem, the *_OS pair to the
// optimal-solution (entropy) problem. Both pairs are defined later in this file.
void init_f_VS(int ndim, ColumnVector& x);
void init_f_OS(int ndim, ColumnVector& x);
void f_VS(int mode, int ndim, const ColumnVector& x, double& fx,
          ColumnVector& gx, SymmetricMatrix& Hx, int& result);
void f_OS(int mode, int ndim, const ColumnVector& x, double& fx,
          ColumnVector& gx, SymmetricMatrix& Hx, int& result);

// Prints each row of the system cteA . X = cteB.
void print_constraints(Matrix& cteA, ColumnVector& cteB);

// Builds the box constraint 0 <= x(k) <= 1 for each of the 2^n variables,
// where n is the number of marginal probabilities. jointProbability is
// accepted for symmetry with build_linear_contraints but is not used.
// The returned object is allocated with new.
BoundConstraint* build_bound_contraints(
      const vector<double>& marginalProbability,
      const vector<pair<int,double> >& jointProbability )
{
  const int varCount = 1 << marginalProbability.size();
  ColumnVector lowerBound(varCount), upperBound(varCount);

  // Every variable is a probability: lowerBound <= x <= upperBound
  for( int k = 1; k <= varCount; k++ )
  {
    lowerBound(k) = 0.0;
    upperBound(k) = 1.0;
  }

  return new BoundConstraint(varCount, lowerBound, upperBound);
}

// Builds the equality constraints coeff . X = rhs over the 2^n variables:
//   row 1                 : all variables sum to 1
//   one row per predicate : variables whose index contains the predicate's
//                           bit sum to its marginal probability
//   one row per joint     : variables whose index contains every bit of the
//                           joint mask sum to the given joint probability
// The returned object is allocated with new.
LinearEquation* build_linear_contraints(
          const vector<double>& marginalProbability,
          const vector<pair<int,double> >& jointProbability )
{
  const int predCount  = marginalProbability.size();
  const int jointCount = jointProbability.size();
  const int varCount   = 1 << marginalProbability.size();
  const int rowCount   = predCount + jointCount + 1;

  Matrix coeff(rowCount, varCount);
  ColumnVector rhs(rowCount);

  int row = 1;

  // Normalization row
  for( int col = 1; col <= varCount; col++ )
    coeff(row, col) = 1.0;
  rhs(row) = 1.0;
  row++;

  // Marginal probability rows
  for( int pred = 0; pred < predCount; pred++, row++ )
  {
    const int predBit = BIT(pred);

    for( int col = 1; col <= varCount; col++ )
      coeff(row, col) = (predBit & col)? 1.0 : 0.0;

    rhs(row) = marginalProbability[pred];
  }

  // Joint probability rows
  for( int k = 0; k < jointCount; k++, row++ )
  {
    const int jointMask = jointProbability[k].first;

    for( int col = 1; col <= varCount; col++ )
      coeff(row, col) = ((jointMask & col) == jointMask)? 1.0 : 0.0;

    rhs(row) = jointProbability[k].second;
  }

  return new LinearEquation(coeff, rhs);
}

ostream& operator << (ostream& o, const ColumnVector& v)
{
  LOOP( i, v )
    o << v(i) << endl;

  return o;
}

// Prints the solution vector, the total probability mass and, for every
// non-empty predicate mask, the indices of the variables that contain all
// bits of the mask followed by their accumulated probability.
void print_cond_prob( int npred, const ColumnVector& x )
{
  const int atomCount = 1 << npred;

  cout << x << endl;

  double total = 0.0;
  for( int atom = 1; atom <= atomCount; atom++ )
    total += x(atom);

  cout << "All (must be 1): " << total << endl;

  for( int mask = 1; mask < atomCount; mask++ )
  {
    double covered = 0.0;

    cout << mask << ": ";

    // An index can only contain every bit of mask if it is >= mask.
    for( int atom = mask; atom < atomCount; atom++ )
    {
      if( (mask & atom) != mask )
        continue;

      cout << atom << " ";
      covered += x(atom);
    }

    cout << " => " << covered << endl;
  }
}

#ifdef STAND_ALONE

// Stand-alone driver: parses "n p1..pn [cp1 cp2 ...]" from the command line
// and runs maximize_entropy on it.
//
// The k-th joint probability (k = 1, 2, ...) is attached to the mask of the
// first k+1 predicates, i.e. cp1 = P(p1 and p2), cp2 = P(p1 and p2 and p3).
// Returns 0 on success (or when only the usage text is shown) and 1 when the
// arguments are inconsistent.
int main( int argc, const char* argv[] )
{
  if( argc == 1 )
  {
    cout << "Computes conditional probability using Maximum Entropy" << endl
         << "Usage: entropy n [p1 p2 p3 ... pn] [cp1 cp2...]" << endl
         << "where n is the number of predicates, (p1..pn) is the probability "
         << "of each predicate and (cp1..cpn) is the joint probability "
         << "assuming the following implicit order:" << endl
         << "cp1 = P(p1 and p2), cp2 = P(p1 and p2 and p3) and so on."
         << endl << endl;

    return 0;
  }

  const int npred = atoi( argv[1] );

  // The number of variables is 1 << npred, so npred must be positive and
  // small enough for the shift to stay inside an int.
  if( npred < 1 || npred > 30 )
  {
    cerr << "Invalid number of predicates: " << argv[1] << endl;
    return 1;
  }

  // Without this check the loop below would read past the end of argv.
  if( argc < npred + 2 )
  {
    cerr << "Expected " << npred << " marginal probabilities but got "
         << (argc - 2) << endl;
    return 1;
  }

  // At most npred - 1 joint probabilities are meaningful; the surplus would
  // be truncated by adjust_joint_probabilities anyway, and capping it here
  // keeps BIT(i+1) below the width of int.
  int ngiven = argc - npred - 2;
  if( ngiven > npred - 1 )
    ngiven = npred - 1;

  vector<double> marginalProb;
  vector<pair<int,double> > jointProb;
  vector<pair<int,double> > estimProb;

  for( int i = 0; i < npred; i++ )
    marginalProb.push_back( satof( argv[i+2] ) );

  // j accumulates the mask of the first i+2 predicates.
  for( int i = 0, j = 1; i < ngiven; i++ )
  {
    j |= BIT(i+1);
    jointProb.push_back( pair<int,double>( j, satof( argv[i+npred+2] ) ) );
  }

  maximize_entropy(marginalProb, jointProb, estimProb);

  return 0;
}

#endif

 // Auxiliary function that tightens the [minP, maxP] range of every
 // conjunction so that the ranges stay feasible. Each mask `subset` is
 // combined with every single-predicate mask `single` it does not contain:
 //  - the lower bound of the union is raised to minP(subset)+minP(single)-1
 //    when that sum reaches 1;
 //  - the upper bound of the union is lowered to the smaller of both parts.
 // For example, if p(A) = 0.6 and p(B) = 0.7, p(A.B) must be in the range
 // [0.3, 0.6].
 void adjust_ranges( const int nVars, ColumnVector& minP, ColumnVector& maxP )
 {
   for( int subset = 1; subset < nVars; subset++ )
   {
     for( int single = 1; single < nVars; single <<= 1 )
     {
       if( (subset & single) != 0 )
         continue;   // predicate already part of subset

       const int joined = subset | single;

       if( minP(subset) + minP(single) >= 1 )
         minP(joined) = fmax(minP(joined), minP(subset) + minP(single) - 1);

       maxP(joined) = fmin(maxP(joined), fmin(maxP(subset), maxP(single)));
     }
   }
 }

// Ensures that no selectivity is exactly 0 or 1, since in those cases the
// algorithm does not converge. Because the results are approximations anyway,
// offending values are moved by a small fraction (TOL_INIT). Feasibility is
// checked as well: the marginal probabilities are measured on one sample and
// the joint probabilities on another, which may produce inconsistent input.
//
// margProb  marginal probability of each predicate (predicate i owns bit i)
// jntProb   (bit mask, probability) pairs; truncated to at most
//           margProb.size() - 1 entries and each probability is clamped, in
//           order, into the range that is still feasible at that point
//
// NOTE(review): with an empty margProb the expression margProb.size() - 1
// wraps around and resize() is expected to throw std::length_error; this is
// unchanged from the original, and maximize_entropy's catch-all handles it.
void adjust_joint_probabilities( const vector<double>& margProb,
                                 vector<pair<int,double> >& jntProb )
{
  const int nVars = 1 << margProb.size();
  ColumnVector minP(nVars), maxP(nVars);

  // Error in input size - just truncate (resize keeps the leading entries)
  if( jntProb.size() >= margProb.size() )
    jntProb.resize(margProb.size() - 1);

  const int nPred  = static_cast<int>(margProb.size());
  const int nJoint = static_cast<int>(jntProb.size());

  // Generic Ranges
  for( int i = 1; i <= nVars; i++ )
  {
    minP(i) = 0.0 + TOL_INIT;
    maxP(i) = 1.0 - TOL_INIT;
  }

  // Ranges for marginal Probabilities: known exactly, so min == max
  for( int i = 0; i < nPred; i++ )
    minP(1 << i) = maxP(1 << i) = margProb[i];

  // Verify feasibility
  adjust_ranges( nVars, minP, maxP );

  // Check each joint probability against ranges and adjust them
  for( int i = 0; i < nJoint; i++ )
  {
    const int node = jntProb[i].first;
    double prob = jntProb[i].second;

    // Change probabilities to agree with ranges
    if( prob < minP(node) + TOL_INIT )
      prob = minP(node) + TOL_INIT;
    else if( prob > maxP(node) - TOL_INIT )
      prob = maxP(node) - TOL_INIT;

    // The adjusted value is now fixed: collapse its range and propagate
    minP(node) = maxP(node) = prob;
    jntProb[i].second = prob;

    adjust_ranges( nVars, minP, maxP );
  }
}

// Prints the system cteA . X = cteB, one equation per line: the coefficients
// of the row separated by blanks, then "= " and the right-hand side.
void print_constraints( Matrix& cteA, ColumnVector& cteB )
{
  for( int row = 1; row <= cteA.Nrows(); row++ )
  {
    int col = 1;
    while( col <= cteA.Ncols() )
    {
      cout << cteA(row, col) << " ";
      col++;
    }

    cout << "= " << cteB(row) << endl;
  }
}

// Computes a solution assuming that there is no correlation between predicates
void find_independent_solution( const vector<double>& margProb,
                                ColumnVector& X )
{
  const int nVars = 1 << margProb.size();
  ColumnVector B(nVars);
  Matrix A(nVars, nVars);

  // Compute joint probabilities
  for( int pred = 1; pred < nVars; pred++ )
  {
    double prob = 1.0;

    for( int i = 1; i <= nVars; i++ )
      A(pred, i) = ((pred & i) == pred)? 1.0 : 0.0;

    for( int j = 0; j < margProb.size(); j++ )
      if( (pred & BIT(j)) == BIT(j) )
        prob *= margProb[j];

    B(pred) = prob;
  }

  for( int i = 1; i <= nVars; i++ )
    A(nVars, i) = 1.0;

  B(nVars) = 1.0;

  // Find the solution of the linear equations
  X = A.i() * B;
}

// The algorithm is very sensible to the initial point, so we compute a viable
// solution and use it as initial point
void find_viable_solution( const vector<double>& margProb,
                           const vector<pair<int,double> >& jntProb,
                           TOLS& tols,
                           ColumnVector& x )
{
  const int nVars = 1 << margProb.size();
  CompoundConstraint *constraints =
    new CompoundConstraint(build_bound_contraints(margProb, jntProb),
                           build_linear_contraints(margProb, jntProb));
  NLF2 viableSolutionProblem(nVars, f_VS, init_f_VS, constraints);
  OptNIPS solver(&viableSolutionProblem, tols);

  solver.setOutputFile("output.txt", 0);
  solver.setSearchStrategy(LineSearch);
  solver.setMeritFcn(NormFmu);
//  solver.setDebug();
  solver.optimize();
  solver.printStatus("Viable Solution");
  x = viableSolutionProblem.getXc();
  solver.cleanup();

  for( int i=1; i <= nVars; i++ )
    if( x(i) < EPSILON )
      x(i) = EPSILON;
}


// Merit function chosen by maximize_entropy: ArgaezTapia after a successful
// viable-point search, NormFmu otherwise.
// NOTE(review): the only visible consumer, in find_optimal_solution, is
// commented out.
MeritFcn meritFcn;

void find_optimal_solution( const vector<double>& margProb,
                            const vector<pair<int,double> >& jntProb,
                            TOLS& tols,
                            ColumnVector& x )
{
  const int nVars = 1 << margProb.size();
  CompoundConstraint *constraint =
    new CompoundConstraint(build_linear_contraints(margProb, jntProb));
  NLF2 entropyProblem(nVars, f_OS, init_f_OS, constraint);
  OptNIPS solver(&entropyProblem, tols);

  solver.setOutputFile("output.txt", 1);
  solver.setSearchStrategy(LineSearch);
//  solver.setMeritFcn(meritFcn);
  solver.setMeritFcn(NormFmu);
  solver.optimize();
  solver.printStatus("Solution for Entropy");
  x = entropyProblem.getXc();
  solver.cleanup();

  for( int i=1; i <= nVars; i++ )
    if( x(i) < EPSILON )
      x(i) = EPSILON;

  cout << "Entropy: " << - entropyProblem.getF() << endl;

#ifdef STAND_ALONE
  print_cond_prob(margProb.size(), x);
#endif
}

// Computes the combined probabilities from a solution: for every mask from 1
// to x.Nrows()-1, the probability is the sum of the components of x whose
// index contains all bits of the mask. The output vector is cleared first
// and then receives one (mask, probability) pair per mask, in ascending
// mask order.
void compute_probabilities(const ColumnVector &x,
                           vector<pair<int,double> >& estimatedProbability)
{
  const int rows = x.Nrows();

  estimatedProbability.clear();

  for( int mask = 1; mask < rows; mask++ )
  {
    double total = 0.0;

    // An index can only contain every bit of mask if it is >= mask.
    for( int idx = mask; idx < rows; idx++ )
    {
      if( (mask & idx) == mask )
        total += x(idx);
    }

    estimatedProbability.push_back(pair<int,double>(mask, total));
  }
}

// Checks that x is a valid solution whether or not the solver converged.
// Throws the int value 0 when any of these checks fails:
//  - a component of x is below -TOL_SOL;
//  - the components of x do not sum to 1 within TOL_SOL;
//  - an estimated probability is negative (no tolerance);
//  - an estimated probability differs from the measured joint probability
//    with the same mask by more than TOL_SOL. Both lists are walked in
//    order, so measured entries are matched in the sequence they are given.
void verify_solution( const ColumnVector &x,
                      const vector<pair<int,double> >& jointProbability,
                      const vector<pair<int,double> >& estimatedProbability)
{
  const int rows = x.Nrows();
  const int estimatedCount = estimatedProbability.size();
  const int measuredCount = jointProbability.size();

  // Negative components (small tolerance allowed) and total mass
  double total = 0.0;
  for( int k = 1; k <= rows; k++ )
  {
    if( x(k) < - TOL_SOL )
      throw 0;

    total += x(k);
  }

  if( fabs( total - 1 ) > TOL_SOL )
    throw 0;

  // Negative joint probabilities - no tolerance
  for( int e = 0; e < estimatedCount; e++ )
  {
    if( estimatedProbability[e].second < 0 )
      throw 0;
  }

  // Closeness to the measured joint probabilities
  int m = 0;
  for( int e = 0; e < estimatedCount && m < measuredCount; e++ )
  {
    if( jointProbability[m].first != estimatedProbability[e].first )
      continue;

    const double gap =
      jointProbability[m].second - estimatedProbability[e].second;
    if( fabs( gap ) > TOL_SOL )
      throw 0;

    m++;
  }
}

// These variables are used to bypass a flaw in the design of the library:
// the init_f callbacks only receive (ndim, x), but we need more parameters
// there. maximize_entropy sets these pointers before running the solvers.
static vector<double>* ptrMargProb;
static vector<pair<int,double> >* ptrJointProb;
static ColumnVector* ptrInitPoint;

void maximize_entropy( vector<double>& marginalProbability,
                       vector<pair<int,double> >& jointProbability,
                       vector<pair<int,double> >& estimatedProbability )
{
  TOLS tols;
  const int nVars = 1 << marginalProbability.size(),
            maxIter = 10000 / nVars;
  ColumnVector initPoint(nVars),
               x(nVars);

  ptrMargProb = &marginalProbability;
  ptrJointProb= &jointProbability;
  ptrInitPoint = &initPoint;

  estimatedProbability.clear();

  // Sets tolerance values
  tols.setDefaultTol();
  tols.setMinStep(1e-16);
  tols.setStepTol(1e-12);
  tols.setFTol(1e-9);
  tols.setCTol(1e-9);
  tols.setGTol(1e-9);
  tols.setLSTol(1e-8);
  tols.setMaxIter(maxIter);
  tols.setMaxBacktrackIter(maxIter);
  tols.setMaxFeval(maxIter);

  // Library flaw
  OptNIPSLike_interpolate = true;

  try
  {
    // Some adjustments are made to ensure convergence when we have any
    // selectivity = 1
    adjust_joint_probabilities(marginalProbability, jointProbability);

    if( marginalProbability.size() > 2 && jointProbability.size() > 0 )
    {
      try
      {
        // First optimization problem
        find_viable_solution(marginalProbability, jointProbability,
                             tols, initPoint);
        compute_probabilities(initPoint, estimatedProbability);
        verify_solution(initPoint, jointProbability, estimatedProbability);
        meritFcn = ArgaezTapia;
      }
      catch(...)
      { // If some error occurs, try using a neutral approach for init point
        cout << "Viable point search failed..." << endl;
        for( int i = 1; i <= nVars; i++ )
          initPoint(i) = 1.0;

        meritFcn = NormFmu;
      }

      // Second optimization problem
      find_optimal_solution(marginalProbability, jointProbability, tols, x);
      compute_probabilities(x, estimatedProbability);
      verify_solution(x, jointProbability, estimatedProbability);
    }
    else // In these cases we don't need to use a numerical method
      if( marginalProbability.size() > 1 && jointProbability.size() == 0 )
      {
        find_independent_solution(marginalProbability, x );
        compute_probabilities(x, estimatedProbability);
      }
      else if( marginalProbability.size() == 2 && jointProbability.size() == 1 )
      {
        estimatedProbability.push_back(pair<int,double>(1,
                                marginalProbability[0]));
        estimatedProbability.push_back(pair<int,double>(2,
                                 marginalProbability[1]));
        estimatedProbability.push_back(jointProbability[0]);
      }
      else // marginalProbability.size() == 1 && jointProbability.size() == 0
      {
        estimatedProbability.push_back(pair<int,double>(1,
                                 marginalProbability[0]));
      }
    cout << "Entropy approach converged!" << endl;
  }
  catch(...)
  {
    // If any problem occurs, use the "independence of predicates" approach.
    find_independent_solution(marginalProbability, x );
    compute_probabilities(x, estimatedProbability);

    cout << "Unable to use the entropy approach!" << endl;
  }
}

// Start point for the viable-solution problem.
// With more than three variables and at least two joint probabilities, the
// last joint probability is placed at index ndim-1, the difference between
// the second-to-last and the last one at index ndim/2-1, and the rest of the
// unit mass is divided by ndim-2 and assigned to every other component.
// Otherwise every component starts at 1/ndim.
void init_f_VS(int ndim, ColumnVector& x)
{
  const bool seedFromJoint = ndim > 3 && ptrJointProb->size() > 1;

  if( !seedFromJoint )
  {
    for( int k = 1; k <= ndim; k++ )
      x(k) = 1.0 / ndim;
  }
  else
  {
    const vector<pair<int,double> >& joint = *ptrJointProb;
    const double lastJoint = joint[joint.size()-1].second;
    const double prevOnly  = joint[joint.size()-2].second - lastJoint;
    const double share     = (1.0 - lastJoint - prevOnly) / (ndim - 2);

    for( int k = 1; k <= ndim; k++ )
      x(k) = share;

    x(ndim-1) = lastJoint;
    x(ndim/2 - 1) = prevOnly;
  }

#ifdef STAND_ALONE
  cout << "Initial point: " << endl << x << endl;
#endif
}

// ptrInitPoint holds the solution of first optimization problem
void init_f_OS(int, ColumnVector& x)
{
  x = *ptrInitPoint;

#ifdef STAND_ALONE
  cout << "Viable point: " << endl << x << endl;
#endif
}

// Some adjustments to make the entropy function continuous and
// differentiable at 0.
void f_OS(int mode, int ndim, const ColumnVector& x, double& fx,
          ColumnVector& gx, SymmetricMatrix& hx, int& result)
{
  if( mode & NLPFunction )
  {
    double val = 0.0;

    LOOP( i, x )
    {
      const double xi = x(i);

      if( xi > EPSILON )
        val += xi * log(xi);
      else
        val += xi * log(EPSILON);
    }

    fx = val;
    result = NLPFunction;
  }

  if( mode & NLPGradient )
  {
    LOOP( i, x )
    {
      const double xi = x(i);

      if( xi > EPSILON )
        gx(i) = log(xi) + 1;
      else
        gx(i) = log(EPSILON);
    }

    result = NLPGradient;
  }

  if( mode & NLPHessian )
  {
    LOOP( i, x )
    {
      const double xi = x(i);

      if( xi > EPSILON )
        hx(i,i) = 1.0/xi;
      else
        hx(i,i) = 0;
    }

    result = NLPHessian;
  }
}

// Very simple function to get consistent probabilities for start point
void f_VS(int mode, int ndim, const ColumnVector& x, double& fx,
       ColumnVector& gx, SymmetricMatrix& hx,
       int& result)
{
  if( mode & NLPFunction )
  {
    double val = 0.0;

    LOOP( i, x ) val += - x(i);

    fx = val;
    result = NLPFunction;
  }

  if( mode & NLPGradient )
  {
    LOOP( i, x ) gx(i) = - 1.0;

    result = NLPGradient;
  }

  if( mode & NLPHessian )
  {
    LOOP( i, x ) hx(i,i) = 0.0;

    result = NLPHessian;
  }
}