/* ---- This file is NOT part of SECONDO. * Authors: Greg Hamerly and Jonathan Drake * Feedback: hamerly@cs.baylor.edu * See: http://cs.baylor.edu/~hamerly/software/kmeans.php * Copyright 2014 * * Kmeans is an abstract base class for algorithms which implement Lloyd's * k-means algorithm. Subclasses provide functionality in the "runThread()" * method. ---- //paragraph [1] Title: [{\Large \bf \begin{center}] [\end{center}}] //[TOC] [\tableofcontents] [1] Declarations for the k-means algorithm 1 Declarations for the k-means algorithm */ #ifndef KMEANS_H #define KMEANS_H /* Authors: Greg Hamerly and Jonathan Drake * Feedback: hamerly@cs.baylor.edu * See: http://cs.baylor.edu/~hamerly/software/kmeans.php * Copyright 2014 * * Kmeans is an abstract base class for algorithms which implement Lloyd's * k-means algorithm. Subclasses provide functionality in the "runThread()" * method. */ #include "dataset.h" #include #include #ifdef USE_THREADS #include #endif class Kmeans { public: // Construct a K-means object to operate on the given dataset Kmeans(); virtual ~Kmeans() { free(); } // This method kicks off the threads that do the clustering and run // until convergence (or until reaching maxIterations). It returns the // number of iterations performed. int run(int aMaxIterations = std::numeric_limits::max()); // Get the cluster assignment for the given point index. int getAssignment(int xIndex) const { return assignment[xIndex]; } // Initialize the algorithm at the beginning of the run(), with the // given data and initial assignment. The parameter initialAssignment // will be modified by this algorithm and will at the end contain the // final assignment of clusters. virtual void initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads); // Free all memory being used by the object. virtual void free(); // This method verifies that the current assignment is correct, by // checking every point-center distance. For debugging. virtual void verifyAssignment(int iteration, int startNdx, int endNdx) const; // Compute the sum of squared errors for the data on the centers (not // designed to be fast). virtual double getSSE() const; // Get the name of this clustering algorithm (to be overridden by // subclasses). virtual std::string getName() const = 0; // Virtual methods for computing inner products (depending on the kernel // being used, e.g.). For vanilla k-means these will be the standard dot // product; for more exotic applications these will be other kernel // functions. virtual double pointPointInnerProduct(int x1, int x2) const = 0; virtual double pointCenterInnerProduct(int xndx, unsigned short cndx) const = 0; virtual double centerCenterInnerProduct(unsigned short c1, unsigned short c2) const = 0; // Use the inner products to compute squared distances between a point // and center. virtual double pointCenterDist2(int x1, unsigned short cndx) const { #ifdef COUNT_DISTANCES ++numDistances; #endif return pointPointInnerProduct(x1, x1) - 2 * pointCenterInnerProduct(x1, cndx) + centerCenterInnerProduct(cndx, cndx); } // Use the inner products to compute squared distances between two // centers. virtual double centerCenterDist2(unsigned short c1, unsigned short c2) const { #ifdef COUNT_DISTANCES ++numDistances; #endif return centerCenterInnerProduct(c1, c1) - 2 * centerCenterInnerProduct(c1, c2) + centerCenterInnerProduct(c2, c2); } #ifdef COUNT_DISTANCES #ifdef USE_THREADS // Note: numDistances is NOT thread-safe, but it is not meant to be // enabled in performant code. #error Counting distances and utilizing multiple thrds is not supported. #endif mutable long long numDistances; #endif protected: // The dataset to cluster. const Dataset *x; // Local copies for convenience. int n, k, d; // Pthread primitives for multithreading. int numThreads; #ifdef USE_THREADS pthread_barrier_t barrier; #endif // To communicate (to all threads) that we have converged. bool converged; // Keep track of how many points are in each cluster, divided over each // thread. int **clusterSize; // centerMovement is computed in move_centers() and used to detect // convergence (if max(centerMovement) == 0.0) and update point-center // distance bounds (in subclasses that use them). double *centerMovement; // For each point in x, keep which cluster it is assigned to. By using a // short, we assume a limited number of clusters (fewer than 2^16). unsigned short *assignment; // This is where each thread does its work. virtual int runThread(int threadId, int maxIterations) = 0; // Static entry method for pthread_create(). static void *runner(void *args); // Assign point at xIndex to cluster newCluster, //working within thread threadId. virtual void changeAssignment(int xIndex, int newCluster, int threadId); // Over what range in [0, n) does this thread have ownership of the // points? end() returns one past the last owned point. int start(int threadId) const { return n * threadId / numThreads; } int end(int threadId) const { return start(threadId + 1); } int whichThread(int index) const { return index * numThreads / n; } // Convenience method for causing all threads to synchronize. void synchronizeAllThreads() { #ifdef USE_THREADS pthread_barrier_wait(&barrier); #endif } }; #endif