secondo/Algebras/AssociationAnalysis/Apriori.cpp

/*
----
This file is part of SECONDO.

Copyright (C) 2021, University in Hagen, Department of Computer Science,
Database Systems for New Applications.

SECONDO is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

SECONDO is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with SECONDO; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
----

//paragraph [1] Title: [{\Large \bf \begin {center}] [\end {center}}]

[1] Association Analysis Algebra Implementation

January 2021 - April 2021, P. Fedorow for bachelor thesis.

*/

#include "Apriori.h"

#include "Common.h"

#include "StandardTypes.h"

#include <iterator>
#include <memory>

namespace AssociationAnalysis {

// Bit flags that switch off individual optimizations of the apriori
// implementation. Several flags can be combined with a bitwise or.
enum Deoptimize {
  // Check item containment directly on the transaction instead of using a
  // transaction bitmap.
  NoTransactionBitmap = 0x1,
  // Keep the candidates in a plain vector instead of an itemset hash-tree.
  NoHashTree = 0x2,
  // Skip the prune step of the candidate generation.
  NoPruning = 0x4,
  // Do not determine the frequent 2-itemsets with a triangular matrix.
  NoTriangularMatrix = 0x8,
};

// An itemset stored as a bitmap. Item k is part of the itemset exactly when
// the k-th bit of the bitmap is set (counting up from the least significant
// bit of the first chunk).
class ItemsetBitmap {
public:
  // Adds the given item to the itemset. The bitmap grows on demand so that
  // the bit of the item fits in.
  void insert(unsigned long item) {
    const unsigned long chunk = chunkOf(item);
    if (chunk >= bitmap.size()) {
      bitmap.resize(chunk + 1);
    }
    bitmap[chunk] |= bitOf(item);
  }

  // Returns true if the itemset contains the given item. Items beyond the
  // currently allocated chunks are never contained.
  [[nodiscard]] bool contains(unsigned long item) const {
    const unsigned long chunk = chunkOf(item);
    return chunk < bitmap.size() && (bitmap[chunk] & bitOf(item)) != 0;
  }

  // Removes all items. The chunks are zeroed in place, so the allocated
  // memory is kept and the bitmap can be reused efficiently.
  void reset() { bitmap.assign(bitmap.size(), 0ul); }

private:
  // Number of items (bits) stored per chunk of the bitmap.
  static constexpr unsigned long BITS_PER_CHUNK = sizeof(unsigned long) * 8;

  // The chunks of the bitmap; chunk c holds the bits of the items
  // c * BITS_PER_CHUNK up to (c + 1) * BITS_PER_CHUNK - 1.
  std::vector<unsigned long> bitmap;

  // Returns the position of the chunk that holds the bit of the given item.
  static unsigned long chunkOf(unsigned long item) {
    return item / BITS_PER_CHUNK;
  }

  // Returns a mask that selects the bit of the given item within its chunk.
  static unsigned long bitOf(unsigned long item) {
    return 1ul << (item % BITS_PER_CHUNK);
  }
};

// Adds the itemset to the hash-tree. The descent starts at the root node
// (node 0) at depth 0. All itemsets stored in one hash-tree have to be of the
// same size.
void ItemsetHashTree::insert(const std::vector<int> &itemset) {
  const size_t rootNode = 0;
  const size_t rootDepth = 0;
  this->insert(itemset, rootNode, rootDepth);
}

// Returns references to all stored itemsets that potentially are part of the
// given transaction. The traversal starts at the root node (node 0) at
// depth 0.
std::vector<std::reference_wrapper<const std::vector<int>>>
ItemsetHashTree::subset(const collection::IntSet *transaction) const {
  using ItemsetRefs =
      std::vector<std::reference_wrapper<const std::vector<int>>>;

  ItemsetRefs collected;
  const size_t rootNode = 0;
  const size_t rootDepth = 0;
  this->subset(transaction, rootNode, rootDepth, collected);
  return collected;
}

// Returns true if no itemset is stored in the hash-tree.
bool ItemsetHashTree::empty() const {
  // The hash-tree starts out as a single leaf node without itemsets, so the
  // tree is empty exactly when the root still is such a leaf.
  const auto &root = this->nodes[0];
  if (!root.isLeaf) {
    return false;
  }
  return root.itemsets.empty();
}

// Inserts the given itemset into the subtree rooted at the given node.
//
// Inner nodes forward the itemset to one of their children, leaves store it.
// The depth parameter is incremented with every step down the tree, so it
// always tells at which depth of the hash-tree the given node is located.
void ItemsetHashTree::insert(const std::vector<int> &itemset, size_t node,
                             size_t depth) {
  // The hash-tree can only grow to a depth of itemset size + 1.
  assert(depth <= itemset.size());

  // Inner node: the itemset can't be stored here. A node at depth k hashes the
  // k-th item of the itemset to pick the child that continues the insertion.
  if (!this->nodes[node].isLeaf) {
    const size_t slot = itemset[depth] % CHILDREN_NUM;
    const size_t next = this->nodes[node].children[slot];
    this->insert(itemset, next, depth + 1);
    return;
  }

  // Leaf node: store the itemset right here.
  this->nodes[node].itemsets.push_back(itemset);

  // A leaf is broken up once it holds ITEMSET_THRESHOLD itemsets, unless the
  // hash-tree already reached its maximal depth (itemset size + 1) at this
  // node, in which case there is no item left to hash on.
  const bool thresholdReached =
      this->nodes[node].itemsets.size() == ITEMSET_THRESHOLD;
  const bool canBreakUp = depth < itemset.size();
  if (!thresholdReached || !canBreakUp) {
    return;
  }

  // Append CHILDREN_NUM fresh nodes to the tree and register them as the
  // children of the current node.
  const size_t firstChild = this->nodes.size();
  this->nodes[node].children.resize(CHILDREN_NUM);
  this->nodes.resize(firstChild + CHILDREN_NUM);
  for (size_t offset = 0; offset < CHILDREN_NUM; ++offset) {
    this->nodes[node].children[offset] = firstChild + offset;
  }

  // Save the stored itemsets by moving them into a local vector before they
  // are redistributed. The insertions below can grow the nodes vector and
  // trigger a reallocation, which would invalidate an iteration over the
  // itemsets vector that lives inside of it.
  std::vector<std::vector<int>> displaced(
      std::move(this->nodes[node].itemsets));

  // Hand every saved itemset to the child selected by hashing the item at
  // the current depth.
  for (const std::vector<int> &stored : displaced) {
    const size_t slot = stored[depth] % CHILDREN_NUM;
    const size_t target = this->nodes[node].children[slot];
    this->insert(stored, target, depth + 1);
  }

  // From now on this node is an inner node.
  this->nodes[node].isLeaf = false;
}

// Appends all itemsets of the subtree rooted at the given node that
// potentially are part of the given transaction to the itemsets vector.
//
// The hash-tree is traversed top-down and only those children are visited
// that might hold an itemset that is part of the given transaction.
void ItemsetHashTree::subset(
    const collection::IntSet *transaction, size_t node, size_t depth,
    std::vector<std::reference_wrapper<const std::vector<int>>> &itemsets)
    const {
  const auto &current = this->nodes[node];

  // A leaf contributes every itemset it stores.
  if (current.isLeaf) {
    for (const std::vector<int> &stored : current.itemsets) {
      itemsets.push_back(std::cref(stored));
    }
    return;
  }

  // An inner node at depth k hashed the k-th item of its itemsets. The first
  // k items of the transaction are skipped here; every remaining item of the
  // transaction is hashed to mark the child that has to be visited.
  std::vector<bool> wanted(CHILDREN_NUM, false);
  for (size_t pos = depth; pos < transaction->getSize(); ++pos) {
    wanted[transaction->get(pos) % CHILDREN_NUM] = true;
  }

  // Descend into the marked children only.
  for (size_t slot = 0; slot < CHILDREN_NUM; ++slot) {
    if (!wanted[slot]) {
      continue;
    }
    this->subset(transaction, current.children[slot], depth + 1, itemsets);
  }
}

// Generates candidates of size k+1 given frequent itemsets of size k.
//
// Join step: two different frequent itemsets are joined if the last item of
// the first one is smaller than the last item of the second one and both
// agree on all items before their last item. The joined candidate consists of
// the items of the first itemset plus the last item of the second itemset.
//
// Prune step (skipped if noPruning is true): a joined candidate is dropped if
// any of its subsets that lacks exactly one item is not contained in
// prevFrequentItemsets.
//
// Candidates is either ItemsetHashTree (filled via insert) or a container of
// itemsets that supports push_back. The candidates are added in the order in
// which the join step produces them.
template <class Candidates>
Candidates genCandidates(const std::set<std::vector<int>> &prevFrequentItemsets,
                         bool noPruning) {
  // join step
  std::vector<std::vector<int>> joined;
  for (const std::vector<int> &itemset1 : prevFrequentItemsets) {
    if (itemset1.empty()) {
      // An empty itemset has no last item and therefore can't be joined.
      continue;
    }
    const int last1 = itemset1.back();
    const auto prefixEnd1 = itemset1.cend() - 1;

    for (const std::vector<int> &itemset2 : prevFrequentItemsets) {
      if (&itemset1 == &itemset2 || itemset2.empty()) {
        continue;
      }
      const int last2 = itemset2.back();
      if (!(last1 < last2)) {
        continue;
      }

      // Both itemsets have to agree on all items before their last item.
      const auto prefixEnd2 = itemset2.cend() - 1;
      auto it1 = itemset1.cbegin();
      auto it2 = itemset2.cbegin();
      while (it1 != prefixEnd1 && it2 != prefixEnd2 && *it1 == *it2) {
        ++it1;
        ++it2;
      }
      if (it1 != prefixEnd1 && it2 != prefixEnd2) {
        // The comparison stopped at a differing item.
        continue;
      }

      // Place the last item of the second itemset in front of the first item
      // of the first itemset that is not smaller. For an ascending itemset1
      // this is its end, as its last item is smaller than last2.
      auto pos = itemset1.cbegin();
      while (pos != itemset1.cend() && *pos < last2) {
        ++pos;
      }
      std::vector<int> candidate;
      candidate.reserve(itemset1.size() + 1);
      candidate.insert(candidate.cend(), itemset1.cbegin(), pos);
      candidate.push_back(last2);
      candidate.insert(candidate.cend(), pos, itemset1.cend());
      joined.push_back(std::move(candidate));
    }
  }

  // prune step
  Candidates candidates;
  // Buffer for the subsets of a candidate, reused to avoid an allocation per
  // subset.
  std::vector<int> subset;
  for (std::vector<int> &candidate : joined) {
    bool prune = false;
    if (!noPruning) {
      // Start with the subset that omits the first item. Overwriting position
      // omitted - 1 with the item omitted before turns the subset that omits
      // item omitted - 1 into the subset that omits item omitted.
      subset.assign(candidate.cbegin() + 1, candidate.cend());
      for (size_t omitted = 0; omitted < candidate.size(); omitted += 1) {
        if (omitted > 0) {
          subset[omitted - 1] = candidate[omitted - 1];
        }
        if (prevFrequentItemsets.count(subset) == 0) {
          prune = true;
          break;
        }
      }
    }
    if (prune) {
      continue;
    }
    if constexpr (std::is_same<Candidates, ItemsetHashTree>::value) {
      candidates.insert(candidate);
    } else {
      candidates.push_back(std::move(candidate));
    }
  }
  return candidates;
}

// Finds all frequent itemsets that satisfy the support given by minSupport.
// The itemset of a transaction is extracted from each tuple of the relation
// by an index given by itemsetAttr.
aprioriLI::aprioriLI(GenericRelation *relation, int minSupport, int itemsetAttr,
                     int deoptimize) {
  int transactionCount = relation->GetNoTuples();

  int k = 0;
  std::vector<std::set<std::vector<int>>> prevFrequentItemsets;
  prevFrequentItemsets.resize(2);

  if (deoptimize & Deoptimize::NoTriangularMatrix) {
    // Generate the frequent 1-Itemsets.
    // Count how many transactions contain any given item.
    std::unique_ptr<GenericRelationIterator> rit(relation->MakeScan());
    Tuple *t;
    std::unordered_map<int, int> counts;
    while ((t = rit->GetNextTuple()) != nullptr) {
      auto transaction = (collection::IntSet *)t->GetAttribute(itemsetAttr);
      for (int i = 0; i < (int)transaction->getSize(); i += 1) {
        counts[transaction->get(i)] += 1;
      }
      t->DeleteIfAllowed();
    }
    // Add any item as an 1-itemset to the frequent itemsets if it satisfies
    // the minimum support.
    for (auto const &[item, count] : counts) {
      if (count >= minSupport) {
        std::vector<int> itemset = {item};
        prevFrequentItemsets[1].insert(itemset);
        double support = (double)count / (double)transactionCount;
        this->frequentItemsets.emplace_back(itemset, support);
      }
    }
    k = 2;
  } else {
    // Generate the frequent 1-Itemsets and 2-Itemsets.
    // Count how many transactions contain any given item and any given item
    // pair.
    TriangularMatrix triangularMatrix;
    std::unique_ptr<GenericRelationIterator> rit(relation->MakeScan());
    Tuple *t;
    std::unordered_map<int, int> counts;
    while ((t = rit->GetNextTuple()) != nullptr) {
      auto transaction = (collection::IntSet *)t->GetAttribute(itemsetAttr);
      for (int i = 0; i < (int)transaction->getSize(); i += 1) {
        counts[transaction->get(i)] += 1;
        // Insert all 2-itemsets that are part of the transaction into the
        // triangular matrix.
        for (size_t j = i + 1; j < transaction->getSize(); j += 1) {
          triangularMatrix.insert(transaction->get(i), transaction->get(j));
        }
      }
      t->DeleteIfAllowed();
    }
    // Add any item as an 1-itemset to the frequent itemsets if it satisfies
    // the minimum support.
    for (auto const &[item, count] : counts) {
      if (count >= minSupport) {
        std::vector<int> itemset = {item};
        prevFrequentItemsets[1].insert(itemset);
        double support = (double)count / (double)transactionCount;
        this->frequentItemsets.emplace_back(itemset, support);
      }
    }
    // Find the frequent 2-itemsets by pairing up frequent items and checking
    // their support by consulting the triangular matrix.
    prevFrequentItemsets.resize(3);
    auto cend = prevFrequentItemsets[1].cend();
    for (auto it1 = prevFrequentItemsets[1].cbegin(); it1 != cend; it1++) {
      for (auto it2 = std::next(it1); it2 != cend; it2++) {
        int item1 = (*it1)[0];
        int item2 = (*it2)[0];
        if (triangularMatrix.count(item1, item2) >= minSupport) {
          double support = (double)triangularMatrix.count(item1, item2) /
                           (double)transactionCount;
          std::vector<int> itemset(
              {std::min(item1, item2), std::max(item1, item2)});
          prevFrequentItemsets[2].insert(itemset);
          this->frequentItemsets.emplace_back(itemset, support);
        }
      }
    }
    k = 3;
  }

  if ((deoptimize & Deoptimize::NoHashTree) &&
      (deoptimize & Deoptimize::NoTransactionBitmap)) {
    for (int size = k; !prevFrequentItemsets[size - 1].empty(); size += 1) {
      auto candidates = genCandidates<std::vector<std::vector<int>>>(
          prevFrequentItemsets[size - 1], deoptimize & Deoptimize::NoPruning);
      if (candidates.empty()) {
        break;
      }
      // Count how many transactions contain any given candidate.
      std::unique_ptr<GenericRelationIterator> rit(relation->MakeScan());
      Tuple *t;
      std::map<std::vector<int>, int> counts;
      while ((t = rit->GetNextTuple()) != nullptr) {
        auto transaction = (collection::IntSet *)t->GetAttribute(itemsetAttr);
        for (const std::vector<int> &candidate : candidates) {
          bool containsCandidate = true;
          for (int item : candidate) {
            // Check if the item is contained by binary search.
            if (!transaction->contains(item)) {
              containsCandidate = false;
              break;
            }
          }
          if (containsCandidate) {
            counts[candidate] += 1;
          }
        }
        t->DeleteIfAllowed();
      }
      // Add any candidate to the frequent itemsets if it satisfies the
      // minimum support.
      prevFrequentItemsets.resize(size + 1);
      for (auto const &[itemset, count] : counts) {
        if (count >= minSupport) {
          prevFrequentItemsets[size].insert(itemset);
          double support = (double)count / (double)transactionCount;
          this->frequentItemsets.emplace_back(itemset, support);
        }
      }
    }
  } else if ((deoptimize & Deoptimize::NoHashTree) &&
             !(deoptimize & Deoptimize::NoTransactionBitmap)) {
    ItemsetBitmap transactionBitmap;
    for (int size = k; !prevFrequentItemsets[size - 1].empty(); size += 1) {
      auto candidates = genCandidates<std::vector<std::vector<int>>>(
          prevFrequentItemsets[size - 1], deoptimize & Deoptimize::NoPruning);
      if (candidates.empty()) {
        break;
      }
      // Count how many transactions contain any given candidate.
      std::unique_ptr<GenericRelationIterator> rit(relation->MakeScan());
      Tuple *t;
      std::map<std::vector<int>, int> counts;
      while ((t = rit->GetNextTuple()) != nullptr) {
        auto transaction = (collection::IntSet *)t->GetAttribute(itemsetAttr);
        transactionBitmap.reset();
        for (size_t i = 0; i < transaction->getSize(); i += 1) {
          transactionBitmap.insert(transaction->get(i));
        }
        for (const std::vector<int> &candidate : candidates) {
          bool containsCandidate = true;
          for (int item : candidate) {
            // Check if the item is contained in the transaction by a bitmap
            // lookup.
            if (!transactionBitmap.contains(item)) {
              containsCandidate = false;
              break;
            }
          }
          if (containsCandidate) {
            counts[candidate] += 1;
          }
        }
        t->DeleteIfAllowed();
      }
      // Add any candidate to the frequent itemsets if it satisfies the
      // minimum support.
      prevFrequentItemsets.resize(size + 1);
      for (auto const &[itemset, count] : counts) {
        if (count >= minSupport) {
          prevFrequentItemsets[size].insert(itemset);
          double support = (double)count / (double)transactionCount;
          this->frequentItemsets.emplace_back(itemset, support);
        }
      }
    }
  } else if (!(deoptimize & Deoptimize::NoHashTree) &&
             (deoptimize & Deoptimize::NoTransactionBitmap)) {
    for (int size = k; !prevFrequentItemsets[size - 1].empty(); size += 1) {
      auto candidates = genCandidates<ItemsetHashTree>(
          prevFrequentItemsets[size - 1], deoptimize & Deoptimize::NoPruning);
      if (candidates.empty()) {
        break;
      }
      // Count how many transactions contain any given candidate.
      std::unique_ptr<GenericRelationIterator> rit(relation->MakeScan());
      Tuple *t;
      std::map<std::vector<int>, int> counts;
      while ((t = rit->GetNextTuple()) != nullptr) {
        auto transaction = (collection::IntSet *)t->GetAttribute(itemsetAttr);
        for (const std::vector<int> &candidate :
             candidates.subset(transaction)) {
          bool containsCandidate = true;
          for (int item : candidate) {
            // Check if the item is contained by binary search.
            if (!transaction->contains(item)) {
              containsCandidate = false;
              break;
            }
          }
          if (containsCandidate) {
            counts[candidate] += 1;
          }
        }
        t->DeleteIfAllowed();
      }
      // Add any candidate to the frequent itemsets if it satisfies the
      // minimum support.
      prevFrequentItemsets.resize(size + 1);
      for (auto const &[itemset, count] : counts) {
        if (count >= minSupport) {
          prevFrequentItemsets[size].insert(itemset);
          double support = (double)count / (double)transactionCount;
          this->frequentItemsets.emplace_back(itemset, support);
        }
      }
    }
  } else {
    ItemsetBitmap transactionBitmap;
    for (int size = k; !prevFrequentItemsets[size - 1].empty(); size += 1) {
      auto candidates = genCandidates<ItemsetHashTree>(
          prevFrequentItemsets[size - 1], deoptimize & Deoptimize::NoPruning);
      if (candidates.empty()) {
        break;
      }
      // Count how many transactions contain any given candidate.
      std::unique_ptr<GenericRelationIterator> rit(relation->MakeScan());
      Tuple *t;
      std::map<std::vector<int>, int> counts;
      while ((t = rit->GetNextTuple()) != nullptr) {
        auto transaction = (collection::IntSet *)t->GetAttribute(itemsetAttr);
        if (!(deoptimize & Deoptimize::NoTransactionBitmap)) {
          transactionBitmap.reset();
          for (size_t i = 0; i < transaction->getSize(); i += 1) {
            transactionBitmap.insert(transaction->get(i));
          }
        }
        for (const std::vector<int> &candidate :
             candidates.subset(transaction)) {
          bool containsCandidate = true;
          for (int item : candidate) {
            // Check if the item is contained in the transaction by a bitmap
            // lookup.
            if (!transactionBitmap.contains(item)) {
              containsCandidate = false;
              break;
            }
          }
          if (containsCandidate) {
            counts[candidate] += 1;
          }
        }
        t->DeleteIfAllowed();
      }
      // Add any candidate to the frequent itemsets if it satisfies the
      // minimum support.
      prevFrequentItemsets.resize(size + 1);
      for (auto const &[itemset, count] : counts) {
        if (count >= minSupport) {
          prevFrequentItemsets[size].insert(itemset);
          double support = (double)count / (double)transactionCount;
          this->frequentItemsets.emplace_back(itemset, support);
        }
      }
    }
  }

  this->it = this->frequentItemsets.cbegin();

  // Setup resulting tuple type.
  this->tupleType = new TupleType(
      SecondoSystem::GetCatalog()->NumericType(frequentItemsetTupleType()));
}

// Returns the next frequent itemset as a newly allocated tuple, or nullptr
// once all frequent itemsets have been returned. The tuple holds the itemset
// as attribute 0 and its support as attribute 1.
Tuple *aprioriLI::getNext() {
  // All frequent itemsets have been handed out already.
  if (this->it == this->frequentItemsets.cend()) {
    return nullptr;
  }

  auto &[itemset, support] = *this->it;
  Tuple *result = new Tuple(this->tupleType);
  result->PutAttribute(0, new collection::IntSet(std::set<int>(
                              itemset.cbegin(), itemset.cend())));
  result->PutAttribute(1, new CcReal(support));

  // Advance to the itemset that is returned by the next call.
  ++this->it;
  return result;
}
} // namespace AssociationAnalysis