.. _program_listing_file_src_common_definitions.h:

Program Listing for File definitions.h
======================================

|exhale_lsh| :ref:`Return to documentation for file <file_src_common_definitions.h>` (``src/common/definitions.h``)

.. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS

.. code-block:: cpp

   #pragma once

   #include "common/logging.h"
   #include "common/shape.h"
   #include "common/intrusive_ptr.h"

   #include <functional>
   #include <iostream>
   #include <memory>
   #include <string>
   #include <vector>

   #define THREAD_GUARD(body) [&]() { body; }() // test if THREAD_GUARD is necessary, remove if no problems occur.
   #define NodeOp(op) [=]() { op; }

   // helper macro to disable optimization (gcc only)
   // To use this, just insert DONT_OPTIMIZE right before the function definition
   // (e.g. where the "static" keyword would go).
   #ifdef __GNUC__
   #define DONT_OPTIMIZE __attribute__((optimize("O0")))
   #else
   #define DONT_OPTIMIZE // silently ignore on Visual Studio, where this is less of a problem
   #endif

   // Use these macros to enable faster floating-point math. Put them around one
   // or more functions.
   //
   // Usage:
   // MARIAN_FFAST_MATH_BEGIN
   // void LayerNormalization(float *arg) { *arg += 1.0; }
   // void SomethingElse() {}
   // MARIAN_FFAST_MATH_END
   //
   // ffast-math allows the compiler to assume associative arithmetic and finite
   // values.
   //
   // Associative arithmetic is particularly important to vectorize, e.g. a sum:
   // for (const float f : range) sum += f;
   // Without ffast-math, the sum will be done one value at a time. On x86 it
   // still uses vector math, but only uses the first slot and wastes the rest.
   //
   // With ffast-math, the compiler can sum in batches of 4, 8, or 16 floats.
   // Also, it can run multiple adds in parallel e.g. vaddps has latency 4 and
   // throughput 0.5 on Skylake so multiple vector adds can run at once.
   //
   // On average, a vectorized sum is more numerically stable because it sums in
   // batches. Vectorized floats can still produce NaNs and infs (remember even
   // scalar operations are implemented with vector instructions).
   //
   // Allowing the compiler to assume finite values means functions like isnan or
   // isinf do not work as expected. Do not enable this for a function that
   // depends upon fully standard float behavior.
   //
   // It can also change the sign of zeros.
   //
   // Fast math also makes results more architecture dependent because different
   // register widths mean different results. They also depend on the compiler
   // and compiler version more. For example, clang <= 10 does not support the
   // float_control pragma below so it will still be conservative.
   //
   // There is a more conservative option for just associativity:
   // llvm introduced "#pragma clang fp reassociate" that goes inside a function.
   // However, llvm <11 considers that pragma an error so we'd need some ugly
   // version test (which they don't recommend) or a compilation test. Moreover,
   // it has to be in the function to keep scope.
   // gcc supports "-fassociative-math" that has to be outside a function.
   // I didn't find an MSVC equivalent.
   #if defined(_MSC_VER)
   #define MARIAN_FFAST_MATH_BEGIN __pragma(float_control(precise, off, push))
   #define MARIAN_FFAST_MATH_END __pragma(float_control(pop))
   #elif defined(__clang__)
   #define MARIAN_FFAST_MATH_BEGIN _Pragma("float_control(precise, off, push)")
   #define MARIAN_FFAST_MATH_END _Pragma("float_control(pop)")
   #elif defined(__GNUC__)
   // Also available as __attribute__((optimize("-ffast-math"))) but done as pragmas for consistency
   #define MARIAN_FFAST_MATH_BEGIN _Pragma("GCC push_options") _Pragma("GCC optimize(\"-ffast-math\")")
   #define MARIAN_FFAST_MATH_END _Pragma("GCC pop_options")
   #endif

   namespace marian {

   // Type to be used for all index types, e.g. for integer tensors for rows operator.
   // size_t would seem to be the natural choice over uint32_t but usually has 8 bytes
   // while uint32_t has 4 bytes. This type will often be exchanged between CPU and GPU.
   // This minimizes bandwidth at little cost.
   typedef uint32_t IndexType;

   // @TODO: come up with better short name. "I..." stands for interface now. Here it stands
   // for "intrusive". Not a good overlap.
   template <class T>
   using IPtr = IntrusivePtr<T>;

   template <class T>
   using UPtr = std::unique_ptr<T>;

   // @TODO: come up with better short name. "I..." stands for interface now.
   template <class T>
   using IWeak = T*;

   template <class T>
   using Ptr = std::shared_ptr<T>;

   template <class T>
   using Weak = std::weak_ptr<T>;

   template <class T, typename... Args>
   inline Ptr<T> New(Args&&... args) {
     return std::make_shared<T>(std::forward<Args>(args)...);
   }

   template <class T>
   inline Ptr<T> New(Ptr<T> p) {
     return Ptr<T>(p);
   }

   template <class T, typename... Args>
   inline IPtr<T> INew(Args&&... args) {
     return IPtr<T>(new T(std::forward<Args>(args)...));
   }

   template <class T>
   inline IPtr<T> INew(Ptr<T> p) {
     return IPtr<T>(p);
   }

   enum class DeviceType : size_t { gpu = 0, cpu = 1 };

   struct DeviceId {
     size_t no{0};
     DeviceType type{DeviceType::gpu};

     DeviceId() : no{0}, type{DeviceType::gpu} {}
     DeviceId(size_t no_, DeviceType type_) : no(no_), type(type_) {}

     std::string typeAsString() const {
       return (type == DeviceType::gpu ? "gpu" : "cpu");
     }

     operator std::string() const { return typeAsString() + std::to_string(no); }

     friend std::ostream& operator<<(std::ostream& out, DeviceId deviceId) {
       out << std::string(deviceId);
       return out;
     }

     friend bool operator==(DeviceId id1, DeviceId id2) {
       return id1.no == id2.no && id1.type == id2.type;
     }
     friend bool operator!=(DeviceId id1, DeviceId id2) { return !(id1 == id2); }
   };

   // predefine a couple of devices for easier manual use
   const DeviceId CPU0{0, DeviceType::cpu};
   const DeviceId CPU1{1, DeviceType::cpu};
   const DeviceId CPU2{2, DeviceType::cpu};
   const DeviceId CPU3{3, DeviceType::cpu};
   const DeviceId CPU4{4, DeviceType::cpu};
   const DeviceId CPU5{5, DeviceType::cpu};
   const DeviceId CPU6{6, DeviceType::cpu};
   const DeviceId CPU7{7, DeviceType::cpu};

   const DeviceId GPU0{0, DeviceType::gpu};
   const DeviceId GPU1{1, DeviceType::gpu};
   const DeviceId GPU2{2, DeviceType::gpu};
   const DeviceId GPU3{3, DeviceType::gpu};
   const DeviceId GPU4{4, DeviceType::gpu};
   const DeviceId GPU5{5, DeviceType::gpu};
   const DeviceId GPU6{6, DeviceType::gpu};
   const DeviceId GPU7{7, DeviceType::gpu};

   // These are many small objects, hence use IntrusivePtr
   class TensorBase;
   typedef IPtr<TensorBase> Tensor;

   // These are many small objects, hence use IntrusivePtr
   template <class DataType> class Chainable;
   typedef IPtr<Chainable<Tensor>> Expr;

   class OptimizerBase;
   typedef Ptr<OptimizerBase> OptimizerBasePtr;

   class ClipperBase;
   typedef Ptr<ClipperBase> ClipperBasePtr;

   class RunBase;
   typedef Ptr<RunBase> RunBasePtr;

   const float NEMATUS_LN_EPS = 1e-5f;
   }  // namespace marian
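
The comment block above ``MARIAN_FFAST_MATH_BEGIN``/``MARIAN_FFAST_MATH_END`` describes wrapping whole functions so the compiler may reassociate and vectorize floating-point sums. A minimal sketch of that usage, assuming a GCC, clang, or MSVC build of the Marian source tree; the ``sumAll`` helper is hypothetical and not part of this header:

.. code-block:: cpp

   #include <vector>
   #include "common/definitions.h"  // provides MARIAN_FFAST_MATH_BEGIN / _END

   MARIAN_FFAST_MATH_BEGIN
   // Inside this region the compiler may assume associative, finite float math,
   // so the accumulation below can be summed in wide vector batches. Do not use
   // isnan/isinf on values computed here; they may be optimized away.
   inline float sumAll(const std::vector<float>& range) {  // hypothetical helper
     float sum = 0.f;
     for(const float f : range)
       sum += f;
     return sum;
   }
   MARIAN_FFAST_MATH_END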
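
Likewise, the ``Ptr``/``New`` aliases and the predefined ``DeviceId`` constants are meant for call sites of the kind sketched below; ``ModelConfig`` and ``example`` are hypothetical names used only for illustration:

.. code-block:: cpp

   #include <iostream>
   #include <string>
   #include "common/definitions.h"

   struct ModelConfig {  // hypothetical payload type
     std::string name;
     ModelConfig(const std::string& n) : name(n) {}
   };

   void example() {
     using namespace marian;

     // New<T>(...) forwards its arguments to T's constructor and returns a
     // Ptr<T>, i.e. a std::shared_ptr<T>.
     Ptr<ModelConfig> cfg = New<ModelConfig>("transformer");

     // DeviceId converts to a string such as "gpu0" or "cpu2" and can be
     // streamed or compared directly via the friend operators above.
     DeviceId device = GPU0;
     if(device != CPU0)
       std::cout << device << " runs " << cfg->name << std::endl;
   }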