Program Listing for File fastopt.h

Return to documentation for file (src/common/fastopt.h)

#pragma once

#include "common/definitions.h"
#include "3rd_party/any_type.h"
#include "3rd_party/phf/phf.h"
#include "3rd_party/yaml-cpp/yaml.h"

// This file contains code to create a fast access option class,
// meant as a replacment/supplement to YAML::Node.

namespace marian {

namespace crc {
// has to stay in header due to constexpr

// This code comes from https://notes.underscorediscovery.com/constexpr-fnv1a/
// and is distributed as public domain as stated by the author under that link

// constants for hash computations
constexpr uint64_t val_64_const = 0xcbf29ce484222325;
constexpr uint64_t prime_64_const = 0x100000001b3;

// recursive compile-time hash, looking for stack-overflow source
inline constexpr uint64_t
hash_64_fnv1a_const(const char* const str,
                    const uint64_t value = val_64_const) noexcept {
  return (str[0] == '\0') ? value :
      hash_64_fnv1a_const(&str[1], (value ^ uint64_t(str[0])) * prime_64_const);
}

// Compile time string hashing. Should work particularly well for option look up with explicitly used keys like options->get("dim-input");
inline constexpr uint64_t crc(const char* const str) noexcept {
  return hash_64_fnv1a_const(str);
}

}

/*****************************************************************************/

// PerfectHash constructs a perfect hash for a set K of n numeric keys. The size of
// the hash is m > n (not much larger) and n << max(K) (much smaller). The output array size is
// determined by PHF::init in "src/3rd_party/phf/phf.h". m - n fields stay undefined (a bit of waste).

// Wrapper class for the 3rd-party library in "src/3rd_party/phf"
class PerfectHash {
private:
  phf phf_;

  PerfectHash(const uint64_t keys[], size_t num) {
    int error = PHF::init<uint64_t, true>(&phf_, keys, num,
      /* bucket size */ 4,
      /* loading factor */ 90,
      /* seed */ 123456);
    ABORT_IF(error != 0, "PHF error {}", error);
  }

public:

  PerfectHash(const std::vector<uint64_t>& v)
   : PerfectHash(v.data(), v.size()) { }

  ~PerfectHash() {
    PHF::destroy(&phf_);
  }

  // subscript operator [] overloading: if the key is uint64_t, return the hash code directly
  uint32_t operator[](const uint64_t& key) const {
    return PHF::hash<uint64_t>(const_cast<phf*>(&phf_), key);
  }

  // If the key is a string, return the hash code for the string's CRC code
  uint32_t operator[](const char* const keyStr) const {
    return (*this)[crc::crc(keyStr)];
  }

  size_t size() const {
    return phf_.m;
  }
};

/*****************************************************************************/

class FastOpt;

// helper class for conversion, see fastopt.cpp
namespace fastopt_helpers {
  template <typename T>
  struct As {
    static T apply(const FastOpt&);
  };

  template <typename T>
  struct As<std::vector<T>> {
    static std::vector<T> apply(const FastOpt&);
  };

  template <typename T1, typename T2>
  struct As<std::pair<T1, T2>> {
    static std::pair<T1, T2> apply(const FastOpt&);
  };
}

// Fast access option class, meant as a replacment/supplement to YAML::Node.
// Relatively expensive to construct, fast to access (not visible in profiler)
// via std::vector or perfect hash. The perfect hash only requires a few look-ups
// and arithmentic operations, still O(1).
// Still requires YAML::Node support for parsing and modification via rebuilding.
class FastOpt {
private:
  template <typename T>
  friend struct fastopt_helpers::As;

public:
  // Node types for FastOpt, seem to be enough to cover YAML:NodeType
  // Multi-element types include "Sequence" and "Map"
  // "Sequence" is implemented with STL vectors
  // "Map" is implemented with a 3rd-party PHF library (see the PerfectHash class)
  enum struct NodeType {
    Null, Bool, Int64, Float64, String, Sequence, Map
  };

private:
  any_type value_;
  std::unique_ptr<const PerfectHash> ph_;
  std::vector<std::unique_ptr<const FastOpt>> array_;
  NodeType type_{NodeType::Null};

  static const std::unique_ptr<const FastOpt> uniqueNullPtr; // return this unique_ptr if key not found, equivalent to nullptr

  uint64_t fingerprint_{0}; // When node is used as a value in a map, used to check if the perfect hash
                            // returned the right value (they can produce false positives)
  size_t elements_{0};      // Number of elements if isMap or isSequence is true, 0 otherwise.

  // Used to find elements if isSequence() is true.
  // Retrieve the entry using array indexing.
  inline const std::unique_ptr<const FastOpt>& arrayLookup(size_t keyId) const {
    if(keyId < array_.size())
      return array_[keyId];
    else
      return uniqueNullPtr;
  }

  // Used to find elements if isMap() is true.
  // Retrieve the entry from the hash table.
  inline const std::unique_ptr<const FastOpt>& phLookup(uint64_t keyId) const {
    if(ph_)
      return array_[(*ph_)[keyId]];
    else
      return uniqueNullPtr;
  }

  // Builders for different types of nodes.
  // Build Null node.
  void makeNull() {
    elements_ = 0;
    type_ = NodeType::Null;

    ABORT_IF(ph_, "ph_ should be undefined");
    ABORT_IF(!array_.empty(), "array_ should be empty");
  }

  // Build Scalar node via controlled failure to convert from a YAML::Node object.
  void makeScalar(const YAML::Node& v) {
    elements_ = 0;

    // Placeholders for decode
    bool asBool;
    int64_t asInt;
    double asDouble;

    // Text boolean values should be treated as a string
    auto asString  = v.as<std::string>();
    bool isTextBool = asString.size() == 1 && asString.find_first_of("nyNYtfTF") == 0;

    if(YAML::convert<bool>::decode(v, asBool) && !isTextBool) {
      value_ = asBool;
      type_ = NodeType::Bool;
    }
    else if(YAML::convert<int64_t>::decode(v, asInt)) {
      value_ = asInt;
      type_ = NodeType::Int64;
    }
    else if(YAML::convert<double>::decode(v, asDouble)) {
      value_ = asDouble;
      type_ = NodeType::Float64;
    }
    else {
      value_ = asString;
      type_ = NodeType::String;
    }

    ABORT_IF(ph_, "ph_ should be undefined");
    ABORT_IF(!array_.empty(), "array_ should be empty");
  }

  // Build a Sequence node, can by converted to std::vector<T> if elements can be converted to T.
  void makeSequence(const std::vector<YAML::Node>& v) {
    elements_ = v.size();
    ABORT_IF(!array_.empty(), "array_ is not empty??");
    for(size_t pos = 0; pos < v.size(); ++pos) {
      array_.emplace_back(new FastOpt(v[pos], pos));
    }
    type_ = NodeType::Sequence;

    ABORT_IF(ph_, "ph_ should be undefined");
  }

  // Build a Map node.
  void makeMap(const std::map<uint64_t, YAML::Node>& m) {
    std::vector<uint64_t> keys;
    for(const auto& it : m)
      keys.push_back(it.first);

    ABORT_IF(ph_, "ph_ is already defined??");
    ph_.reset(new PerfectHash(keys));

    ABORT_IF(!array_.empty(), "array_ is not empty??");

    // for lack of resize_emplace
    for(int i = 0; i < ph_->size(); ++i)
      array_.emplace_back(nullptr);
    elements_ = keys.size();

    for(const auto& it : m) {
      uint64_t key = it.first;
      size_t pos = (*ph_)[key];
      array_[pos].reset(new FastOpt(it.second, key));
    }

    type_ = NodeType::Map;
  }

  // Build a Map node, uses std::string as key, which gets hashed to uint64_t and used in the function above.
  void makeMap(const std::map<std::string, YAML::Node>& m) {
    std::map<uint64_t, YAML::Node> mi;
    for(const auto& it : m) {
      auto key = it.first.c_str();
      mi[crc::crc(key)] = it.second;
    }

    makeMap(mi);
  }

  // Only build from YAML::Node
  FastOpt(const FastOpt&) = delete;
  FastOpt() = delete;

  void construct(const YAML::Node& node) {
    switch(node.Type()) {
      case YAML::NodeType::Scalar:
        makeScalar(node);
        break;
      case YAML::NodeType::Sequence: {
        std::vector<YAML::Node> nodesVec;
        for(auto&& n : node)
          nodesVec.push_back(n);
        makeSequence(nodesVec);
      } break;
      case YAML::NodeType::Map: {
        std::map<std::string, YAML::Node> nodesMap;
        for(auto& n : node) {
          auto key = n.first.as<std::string>();
          nodesMap[key] = n.second;
        }
        makeMap(nodesMap);
      } break;
      case YAML::NodeType::Undefined:
      case YAML::NodeType::Null:
        makeNull();
    }
  }

public:
  // Constructor to recursively create a FastOpt object from a YAML::Node following the yaml structure.
  FastOpt(const YAML::Node& node)
  { construct(node); }

  FastOpt(const YAML::Node& node, uint64_t fingerprint)
     : fingerprint_{fingerprint}
  { construct(node); }

  // Predicates for node types
  bool isSequence() const {
    return type_ == NodeType::Sequence;
  }

  bool isMap() const {
    return type_ == NodeType::Map;
  }

  bool isScalar() const {
    return type_ == NodeType::Bool
      || type_ == NodeType::Float64
      || type_ == NodeType::Int64
      || type_ == NodeType::String;
  }

  bool isNull() const {
    return type_ == NodeType::Null;
  }

  bool isInt() const {
    return type_ == NodeType::Int64;
  }

  bool isBool() const {
    return type_ == NodeType::Bool;
  }

  bool isFloat() const {
    return type_ == NodeType::Float64;
  }

  bool isString() const {
    return type_ == NodeType::String;
  }

  // actual number of elements in a sequence or map, 0 (not 1) for scalar nodes.
  // 0 here means rather "not applicable".
  size_t size() const {
    return elements_;
  }

  // replace current node with an externally built FastOpt object
  void swap(FastOpt& other) {
    std::swap(value_, other.value_);
    std::swap(ph_, other.ph_);
    std::swap(array_, other.array_);
    std::swap(type_, other.type_);
    std::swap(elements_, other.elements_);
    // leave fingerprint alone as it needed by parent node.
  }

  // Is the hashed key in a map?
  bool has(uint64_t keyId) const {
    if(isMap() && elements_ > 0) {
      const auto& ptr = phLookup(keyId);
      return ptr ? ptr->fingerprint_ == keyId : false;
    } else {
      return false;
    }
  }

  bool has(const char* const key) const {
    return has(crc::crc(key));
  }

  bool has(const std::string& key) const {
    return has(key.c_str());
  }

  // convert to requested type
  template <typename T>
  inline T as() const {
    return fastopt_helpers::As<T>::apply(*this);
  }

  // access sequence or map element
  const FastOpt& operator[](uint64_t keyId) const {
    if(isSequence()) {
      const auto& ptr = arrayLookup((size_t)keyId);
      ABORT_IF(!ptr, "Unseen key {}" , keyId);
      return *ptr;
    } else if(isMap()) {
      const auto& ptr = phLookup(keyId);
      ABORT_IF(!ptr || ptr->fingerprint_ != keyId, "Unseen key {}", keyId);
      return *ptr;
    } else {
      ABORT("Not a sequence or map node");
    }
  }

  // operator [] overloading for non-uint64_t keys
  const FastOpt& operator[](int key) const {
    return operator[]((uint64_t)key);
  }

  const FastOpt& operator[](const char* const key) const {
    return operator[](crc::crc(key));
  }

  const FastOpt& operator[](const std::string& key) const {
    return operator[](key.c_str());
  }
};

}