Files
secondo/Algebras/ImageSimilarity/hamerly_kmeans.cpp
2026-01-23 17:03:45 +08:00

193 lines
6.6 KiB
C++

/*
----
This file is NOT part of SECONDO.
Authors: Greg Hamerly and Jonathan Drake
Feedback: hamerly@cs.baylor.edu
See: http://cs.baylor.edu/~hamerly/software/kmeans.php
Copyright 2014
----
//paragraph [1] Title: [{\Large \bf \begin{center}] [\end{center}}]
//[TOC] [\tableofcontents]
[1] Implementation of the Hamerly's kmeans algorithm
1 Implementation of the Hamerly's kmeans algorithm
*/
/* Authors: Greg Hamerly and Jonathan Drake
* Feedback: hamerly@cs.baylor.edu
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
* Copyright 2014
*/
#include "hamerly_kmeans.h"
#include "general_functions.h"
#include <cmath>
/* Hamerly's algorithm that is a 'simplification' of Elkan's, in that it keeps
* the following bounds:
* - One upper bound per clustered record on the distance between the record
* and its closest center. It is always greater than or equal to the true
* distance between the record and its closest center. This is the same as in
* Elkan's algorithm.
* - *One* lower bound per clustered record on the distance between the record
* and its *second*-closest center. It is always less than or equal to the
* true distance between the record and its second closest center. This is
* different information than Elkan's algorithm -- his algorithm keeps k
* lower bounds for each record, for a total of (n*k) lower bounds.
*
* The basic ideas are:
* - when lower(x) <= upper(x), we need to recalculate the closest centers for
* the record x, and reset lower(x) and upper(x) to their boundary values
* - whenever a center moves
* - calculate the distance it moves 'd'
* - for each record x assigned to that center, update its upper bound
* - upper(x) = upper(x) + d
* - after each iteration
* - find the center that has moved the most (with distance 'd')
* - update the lower bound for all (?) records:
* - lower(x) = lower(x) - d
*
* Parameters: none
*
* Return value: the number of iterations performed (always at least 1)
*/
// this version only updates center locations when necessary
int HamerlyKmeans::runThread(int threadId, int maxIterations) {
int iterations = 0;
int startNdx = start(threadId);
int endNdx = end(threadId);
while ((iterations < maxIterations) && ! converged) {
++iterations;
// compute the inter-center distances, keeping only the closest
//distances
update_s(threadId);
synchronizeAllThreads();
// loop over all records
for (int i = startNdx; i < endNdx; ++i) {
unsigned short closest = assignment[i];
// if upper[i] is less than the greater of these two, then we can
// ignore record i
double upper_comparison_bound = std::max(s[closest],
lower[i]);
// first check: if u(x) <= s(c(x)) or u(x) <= lower(x), then ignore
// x, because its closest center must still be closest
if (upper[i] <= upper_comparison_bound) {
continue;
}
// otherwise, compute the real distance between this record and its
// closest center, and update upper
double u2 = pointCenterDist2(i, closest);
upper[i] = sqrt(u2);
// if (u(x) <= s(c(x))) or (u(x) <= lower(x)), then ignore x
if (upper[i] <= upper_comparison_bound) {
continue;
}
// now update the lower bound by looking at all other centers
double l2 = std::numeric_limits<double>::max();
// the squared lower bound
for (int j = 0; j < k; ++j) {
if (j == closest) { continue; }
double dist2 = pointCenterDist2(i, j);
if (dist2 < u2) {
// another center is closer than the current assignment
// change the lower bound to be the current upper bound
// (since the current upper bound is the distance to the
// now-second-closest known center)
l2 = u2;
// adjust the upper bound and the current assignment
u2 = dist2;
closest = j;
} else if (dist2 < l2) {
// we must reduce the lower bound on the distance to the
// *second* closest center to x[i]
l2 = dist2;
}
}
// we have been dealing in squared distances; need to convert
lower[i] = sqrt(l2);
// if the assignment for i has changed, then adjust the counts and
// locations of each center's accumulated mass
if (assignment[i] != closest) {
upper[i] = sqrt(u2);
changeAssignment(i, closest, threadId);
}
}
verifyAssignment(iterations, startNdx, endNdx);
// ELKAN 4, 5, AND 6
// calculate the new center locations
synchronizeAllThreads();
if (threadId == 0) {
int furthestMovingCenter = move_centers();
converged = (0.0 == centerMovement[furthestMovingCenter]);
}
synchronizeAllThreads();
if (! converged) {
update_bounds(startNdx, endNdx);
}
synchronizeAllThreads();
}
return iterations;
}
/* This method does the following:
* - finds the furthest-moving center
* - finds the distances moved by the two furthest-moving centers
* - updates the upper/lower bounds for each record
*
* Parameters: none
*/
void HamerlyKmeans::update_bounds(int startNdx, int endNdx) {
int furthestMovingCenter = 0;
double longest = centerMovement[furthestMovingCenter];
double secondLongest = 0.0;
for (int j = 0; j < k; ++j) {
if (longest < centerMovement[j]) {
secondLongest = longest;
longest = centerMovement[j];
furthestMovingCenter = j;
} else if (secondLongest < centerMovement[j]) {
secondLongest = centerMovement[j];
}
}
// update upper/lower bounds
for (int i = startNdx; i < endNdx; ++i) {
// the upper bound increases by the amount that its center moved
upper[i] += centerMovement[assignment[i]];
// The lower bound decreases by the maximum amount that any center
// moved, unless the furthest-moving center is the one it's assigned
// to. In the latter case, the lower bound decreases by the amount
// of the second-furthest-moving center.
lower[i] -= (assignment[i] == furthestMovingCenter)
? secondLongest : longest;
}
}