.. _program_listing_file_src_data_alignment.cpp:

Program Listing for File alignment.cpp
======================================

|exhale_lsh| :ref:`Return to documentation for file <file_src_data_alignment.cpp>` (``src/data/alignment.cpp``)

.. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS

.. code-block:: cpp

   #include "data/alignment.h"
   #include "common/utils.h"

   // NOTE(review): everything between angle brackets (system-header names and
   // template arguments) was stripped when this listing was extracted. The
   // headers and template arguments below are reconstructed from how the visible
   // code uses them -- confirm against src/data/alignment.cpp.
   #include <algorithm>
   #include <sstream>
   #include <stdexcept>
   #include <string>
   #include <vector>

   namespace marian {
   namespace data {

   // Creates an alignment without any points.
   WordAlignment::WordAlignment() {}

   // Creates an alignment that stores a copy of the given points.
   WordAlignment::WordAlignment(const std::vector<Point>& align) : data_(align) {}

   // Parses an alignment line and appends a final point that links the two EOS
   // positions.
   //
   // The line is split on ' ' and '-', and the resulting tokens are consumed in
   // (source, target) pairs, each stored with probability 1
   // (presumably the input looks like "0-0 1-2 ..." -- verify against callers).
   //
   // @param line       textual alignment to parse
   // @param srcEosPos  source position of the EOS symbol
   // @param tgtEosPos  target position of the EOS symbol
   // @throws std::invalid_argument if the tokens cannot be paired up or a token
   //         is not a number (the latter is raised by std::stoi)
   // @throws std::out_of_range if a token does not fit into an int (std::stoi)
   WordAlignment::WordAlignment(const std::string& line, size_t srcEosPos, size_t tgtEosPos) {
     const std::vector<std::string> atok = utils::splitAny(line, " -");

     // Tokens are read two at a time; an odd count would index past the end of
     // atok, so reject it with the same exception type std::stoi uses for
     // malformed tokens.
     if(atok.size() % 2 != 0)
       throw std::invalid_argument("Malformed alignment line (unpaired position): '" + line + "'");

     for(size_t i = 0; i < atok.size(); i += 2)
       data_.push_back(Point{static_cast<size_t>(std::stoi(atok[i])),
                             static_cast<size_t>(std::stoi(atok[i + 1])),
                             1.f});

     // add alignment point for both EOS symbols
     data_.push_back(Point{srcEosPos, tgtEosPos, 1.f});
   }

   // Orders the points by source position, breaking ties by target position
   // (both ascending).
   void WordAlignment::sort() {
     std::sort(data_.begin(), data_.end(), [](const Point& a, const Point& b) {
       return (a.srcPos == b.srcPos) ? a.tgtPos < b.tgtPos : a.srcPos < b.srcPos;
     });
   }

   // Re-computes the probability of every point as 1 / (number of points that
   // share its position).
   //
   // reverse==false : normalize target word prob by number of source words
   // reverse==true  : normalize source word prob by number of target words
   void WordAlignment::normalize(bool reverse /*=false*/) {
     std::vector<size_t> counts;
     counts.reserve(data_.size());

     // Selects the position that points are grouped by.
     auto srcOrTgt = [](const Point& p, bool bySource) { return bySource ? p.srcPos : p.tgtPos; };

     // First pass: count how many points fall on each position. The table grows
     // on demand because the largest position is not known up front.
     for(const auto& a : data_) {
       size_t pos = srcOrTgt(a, reverse);
       if(counts.size() <= pos)
         counts.resize(pos + 1, 0);
       counts[pos]++;
     }

     // a.prob at this point is either 1 or normalized to a different value,
     // but we just set it to 1 / count, so multiple calls result in re-normalization
     // regardless of forward or reverse direction. We also set the remaining values to 1.
     for(auto& a : data_) {
       size_t pos = srcOrTgt(a, reverse);
       if(counts[pos] > 1)
         a.prob = 1.f / counts[pos];
       else
         a.prob = 1.f;
     }
   }

   // Renders the points as space-separated "src-tgt" pairs in their stored order.
   // Probabilities are not written.
   std::string WordAlignment::toString() const {
     std::stringstream str;
     for(auto p = begin(); p != end(); ++p) {
       if(p != begin())
         str << " ";
       str << p->srcPos << "-" << p->tgtPos;
     }
     return str.str();
   }

   // Converts a soft alignment, indexed as alignSoft[t][s], into a sorted hard
   // alignment.
   //
   // threshold == 1.f : for every t keep only the s with the largest value
   //                    (first one wins on ties), with probability 1
   // otherwise        : keep every (s, t) whose value is strictly greater than
   //                    threshold, with that value as probability
   //
   // The exact comparison against 1.f is intentional: 1.f acts as the switch for
   // the maximum-value mode.
   WordAlignment ConvertSoftAlignToHardAlign(const SoftAlignment& alignSoft,
                                             float threshold /*= 1.f*/) {
     WordAlignment align;
     // Alignments by maximum value
     if(threshold == 1.f) {
       for(size_t t = 0; t < alignSoft.size(); ++t) {
         // Retrieved alignments are in reversed order
         size_t maxArg = 0;
         // Bound the scan by the row that is actually indexed, so a row shorter
         // than row 0 is never read out of range.
         for(size_t s = 0; s < alignSoft[t].size(); ++s) {
           if(alignSoft[t][maxArg] < alignSoft[t][s]) {
             maxArg = s;
           }
         }
         align.push_back(maxArg, t, 1.f);
       }
     } else {
       // Alignments by greater-than-threshold
       for(size_t t = 0; t < alignSoft.size(); ++t) {
         // Retrieved alignments are in reversed order
         for(size_t s = 0; s < alignSoft[t].size(); ++s) {
           if(alignSoft[t][s] > threshold) {
             align.push_back(s, t, alignSoft[t][s]);
           }
         }
       }
     }

     // Sort alignment pairs in ascending order
     align.sort();

     return align;
   }

   // Serializes a soft alignment: the values of one row are joined with ',' and
   // the rows are joined with ' '.
   std::string SoftAlignToString(SoftAlignment align) {
     std::stringstream str;
     for(size_t t = 0; t < align.size(); ++t) {
       if(t != 0)
         str << " ";
       for(size_t s = 0; s < align[t].size(); ++s) {
         if(s != 0)
           str << ",";
         str << align[t][s];
       }
     }
     return str.str();
   }

   }  // namespace data
   }  // namespace marian