.. _program_listing_file_src_data_corpus_base.cpp: Program Listing for File corpus_base.cpp ======================================== |exhale_lsh| :ref:`Return to documentation for file ` (``src/data/corpus_base.cpp``) .. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS .. code-block:: cpp #include #include "common/file_utils.h" #include "data/corpus.h" #include "data/factored_vocab.h" namespace marian { namespace data { typedef std::vector WordBatch; typedef std::vector MaskBatch; typedef std::pair WordMask; typedef std::vector SentBatch; void SentenceTupleImpl::setWeights(const std::vector& weights) { if(weights.size() != 1) { // this assumes a single sentence-level weight is always fine ABORT_IF(empty(), "Source and target sequences should be added to a tuple before data weights"); auto numWeights = weights.size(); auto numTrgWords = back().size(); // word-level weights may or may not contain a weight for EOS tokens if(numWeights != numTrgWords && numWeights != numTrgWords - 1) LOG(warn, "[warn] " "Number of weights ({}) does not match the number of target words ({}) in line #{}", numWeights, numTrgWords, id_); } weights_ = weights; } CorpusIterator::CorpusIterator() : pos_(-1) {} CorpusIterator::CorpusIterator(CorpusBase* corpus) : corpus_(corpus), pos_(0), tup_(corpus_->next()) {} void CorpusIterator::increment() { tup_ = corpus_->next(); pos_++; } bool CorpusIterator::equal(CorpusIterator const& other) const { return this->pos_ == other.pos_ || (!this->tup_.valid() && !other.tup_.valid()); } const SentenceTuple& CorpusIterator::dereference() const { return tup_; } // These types of corpus constructors are used in in-training validators // (only?), so do not load additional files for guided alignment or data // weighting. CorpusBase::CorpusBase(const std::vector& paths, const std::vector>& vocabs, Ptr options, size_t seed) : DatasetBase(paths, options), RNGEngine(seed), vocabs_(vocabs), maxLength_(options_->get("max-length")), maxLengthCrop_(options_->get("max-length-crop")), rightLeft_(options_->get("right-left")), tsv_(options_->get("tsv", false)), tsvNumInputFields_(getNumberOfTSVInputFields(options)) { // TODO: support passing only one vocab file if we have fully-tied embeddings if(tsv_) { ABORT_IF(tsvNumInputFields_ != vocabs_.size(), "Number of TSV input fields and vocab files does not agree"); } else { ABORT_IF(paths_.size() != vocabs_.size(), "Number of corpus files and vocab files does not agree"); } for(auto path : paths_) { UPtr strm(new io::InputFileStream(path)); ABORT_IF(strm->empty(), "File '{}' is empty", path); files_.emplace_back(std::move(strm)); } initEOS(/*training=*/true); } CorpusBase::CorpusBase(Ptr options, bool translate, size_t seed) : DatasetBase(options), RNGEngine(seed), maxLength_(options_->get("max-length")), maxLengthCrop_(options_->get("max-length-crop")), rightLeft_(options_->get("right-left")), tsv_(options_->get("tsv", false)), tsvNumInputFields_(getNumberOfTSVInputFields(options)) { bool training = !translate; if(training) paths_ = options_->get>("train-sets"); else paths_ = options_->get>("input"); std::vector vocabPaths; if(!options_->get>("vocabs").empty()) vocabPaths = options_->get>("vocabs"); if(training) { if(tsv_) { ABORT_IF(!vocabPaths.empty() && tsvNumInputFields_ != vocabPaths.size(), "Number of TSV input fields and vocab files does not agree"); } else { ABORT_IF(!vocabPaths.empty() && paths_.size() != vocabPaths.size(), "Number of corpus files and vocab files does not agree"); } } bool useGuidedAlignment = options_->get("guided-alignment", std::string("none")) != "none"; bool useDataWeighting = options_->hasAndNotEmpty("data-weighting"); if(training && tsv_) { // For TSV input, we expect that guided-alignment or data-weighting provide the index of a TSV // field that contains the alignments or weights. // // Alignments and weights for non TSV input are handled later, after vocab creation. if(useGuidedAlignment) { try { alignFileIdx_ = std::stoul(options_->get("guided-alignment")); } catch(const std::invalid_argument& /*e*/) { ABORT( "For TSV input, guided-alignment must provide an index of a field with alignments. " "The value '{}' could not be converted to an unsigned integer.", options_->get("guided-alignment")); } LOG(info, "[data] Using word alignments from TSV field no. {}", alignFileIdx_); } if(useDataWeighting) { try { weightFileIdx_ = std::stoul(options_->get("data-weighting")); } catch(const std::invalid_argument& /*e*/) { ABORT( "For TSV input, data-weighting must provide an index of a field with weights. " "The value '{}' could not be converted to an unsigned integer.", options_->get("data-weighting")); } LOG(info, "[data] Using weights from TSV field no. {}", weightFileIdx_); } // check for identical or too large indices size_t maxIndex = tsvNumInputFields_ + size_t(useGuidedAlignment) + size_t(useDataWeighting) - 1; ABORT_IF((useGuidedAlignment && useDataWeighting && alignFileIdx_ == weightFileIdx_) || (useGuidedAlignment && (alignFileIdx_ > maxIndex)) || (useDataWeighting && (weightFileIdx_ > maxIndex)), "For TSV input, guided-alignment and data-weighting must provide an index <= {} " "and be different", maxIndex); } // run this after determining if guided alignment or data weighting is used in TSV input initEOS(training); // @TODO: check if size_t can be used instead of int std::vector maxVocabs = options_->get>("dim-vocabs"); // training or scoring if(training) { // Marian can create vocabularies automatically if no vocabularies are given or they do not // exists under the specified paths. // // Possible cases: // * -t train1 train2 -v vocab1 vocab2 // If vocab1 or vocab2 exists, they are loaded, otherwise separate .yml vocabularies are // created only from train1 or train2 respectively. // // * -t train1 train2 -v vocab vocab // If vocab exists, it is loaded, otherwise it is created from concatenated train1 and train2 // files. // // * -t train1 train2 // If no path is given, separate vocabularies train1.yml and train2.yml are created from // train1 and train2 respectively. // // * --tsv -t train.tsv -v vocab1 vocab2 // If vocab1 or vocab2 exists, it is loaded; otherwise each vocabulary is created from the // appropriate fields in train.tsv. // // * --tsv -t train.tsv -v vocab vocab // If vocab exist, it is loaded; otherwise it is created from all fields in train.tsv. // // * --tsv -t train.tsv // If no path is given, a train.tsv.yml is created from all fields in train.tsv. // // * cat file.tsv | --tsv -t stdin -v vocab1 vocab2 // If either vocab1 or vocab2 does not exist, an error is shown that creation of vocabularies // from stdin is not supported. // // * cat file.tsv | --tsv -t stdin -v vocab vocab // If vocab does not exist, an error is shown that creation of a vocabulary from stdin is not // supported. // // * cat file.tsv | --tsv -t stdin // As above, an error is shown that creation of a vocabulary from stdin is not supported. // // There is more cases for multi-encoder models not listed above. // if(vocabPaths.empty()) { size_t numStreams = tsv_ ? tsvNumInputFields_ : paths_.size(); if(tsv_) { // Creating a vocabulary from stdin is not supported ABORT_IF(paths_[0] == "stdin" || paths_[0] == "-", "Creating vocabularies automatically from a data stream from STDIN is not " "supported. Create vocabularies first and provide them with --vocabs"); // Creating a vocab from a TSV input (from STDIN or a file) with alignments or weights is // not supported ABORT_IF(useGuidedAlignment, "Creating vocabularies automatically from TSV data with alignments is not " "supported. Create vocabularies first and provide them with --vocabs"); ABORT_IF(useDataWeighting, "Creating vocabularies automatically from TSV data with weights is not " "supported. Create vocabularies first and provide them with --vocabs"); } if(maxVocabs.size() < paths_.size()) maxVocabs.resize(paths_.size(), 0); LOG(info, "[data] No vocabulary files given, trying to find or build based on training data."); if(!tsv_) LOG(info, "[data] Vocabularies will be built separately for each file."); else LOG(info, "[data] A joint vocabulary will be built from the TSV file."); std::vector vocabDims(numStreams, 0); std::vector vocabPaths1(numStreams); // Create vocabs if not provided for(size_t i = 0; i < numStreams; ++i) { Ptr vocab = New(options_, i); const auto& path = paths_[tsv_ ? 0 : i]; // idx 0 because there is always only one TSV file std::vector trainPaths = {path}; vocabPaths1[i] = path + ".yml"; vocabDims[i] = (int) vocab->loadOrCreate("", trainPaths, maxVocabs[i]); vocabs_.emplace_back(vocab); } // TODO: this is not nice as it modifies the option object and needs to expose the changes // outside the corpus as models need to know about the vocabulary size; extract the vocab // creation functionality from the class. options_->set("dim-vocabs", vocabDims, "vocabs", vocabPaths1); } else { // Vocabulary paths are given size_t numStreams = tsv_ ? tsvNumInputFields_ : paths_.size(); // Load all vocabs size_t numVocs = vocabPaths.size(); if(maxVocabs.size() < numVocs) maxVocabs.resize(numStreams, 0); // Helper object for grouping training data based on vocabulary file name struct VocabDetails { std::set paths; // all paths that are used for training the vocabulary std::vector streams; // index of the vocabulary in the --vocab option size_t size; // the maximum vocabulary size }; // Group training files based on vocabulary path. If the same // vocab path corresponds to different training files, this means // that a single vocab should combine tokens from all files. std::map groupVocab; // vocabPath -> (trainPaths[], vocabSize) for(size_t i = 0; i < numVocs; ++i) { // Index 0 because there is always only a single TSV input file groupVocab[vocabPaths[i]].paths.insert(paths_[tsv_ ? 0 : i]); groupVocab[vocabPaths[i]].streams.push_back(i); if(groupVocab[vocabPaths[i]].size < maxVocabs[i]) groupVocab[vocabPaths[i]].size = maxVocabs[i]; } auto vocabDims = options_->get>("dim-vocabs"); vocabDims.resize(numVocs, 0); // make sure there is as many dims as vocab paths for(size_t i = 0; i < numVocs; ++i) { if(tsv_) { bool noVocabGiven = (vocabPaths[i].empty() || !filesystem::exists(vocabPaths[i])); // Creating a vocabulary from stdin is not supported ABORT_IF(noVocabGiven && (paths_[0] == "stdin" || paths_[0] == "-"), "Creating vocabulary automatically from a data stream from STDIN is not " "supported. Create vocabularies first and provide them with --vocabs"); // Creating a vocab from a TSV input (from STDIN or a file) with alignments or weights is not supported ABORT_IF(noVocabGiven && useGuidedAlignment, "Creating vocabularies automatically from TSV data with alignments is not " "supported. Create vocabularies first and provide them with --vocabs"); ABORT_IF(noVocabGiven && useDataWeighting, "Creating vocabularies automatically from TSV data with weights is not " "supported. Create vocabularies first and provide them with --vocabs"); } // Get the set of files that corresponds to the vocab. If the next file is the same vocab, // it will not be created again, but just correctly loaded. auto vocabDetails = groupVocab[vocabPaths[i]]; std::vector groupedPaths(vocabDetails.paths.begin(), vocabDetails.paths.end()); Ptr tsvTempFile; // temporary handler for cut fields from TSV input // For a TSV input, multiple vocabularies with different names mean separate // vocabularies for source(s) and target. // If a vocabulary does not exist, it will be created in the next step. To be able to create // a separate vocabulary, we cut tab-separated field(s) from the TSV file, e.g. all source // or target sentences, into a temporary file. if(tsv_ && groupVocab.size() > 1 && !filesystem::exists(vocabPaths[i])) { ABORT_IF(groupedPaths.size() > 1, "There should not be multiple TSV input files!"); tsvTempFile.reset(new io::TemporaryFile(options_->get("tempdir"), false)); LOG(info, "[data] Cutting field(s) {} from {} into a temporary file {}", utils::join(vocabDetails.streams, ", "), groupedPaths[0], tsvTempFile->getFileName()); fileutils::cut(groupedPaths[0], // Index 0 because there is only one TSV file tsvTempFile, vocabDetails.streams, tsvNumInputFields_, " "); // Notice that tab-separated fields are joined with a whitespace groupedPaths.clear(); groupedPaths.push_back(tsvTempFile->getFileName()); } // Load or create the vocabulary Ptr vocab = New(options_, i); vocabDims[i] = (int) vocab->loadOrCreate(vocabPaths[i], groupedPaths, vocabDetails.size); vocabs_.emplace_back(vocab); if(tsvTempFile) tsvTempFile.reset(); } // TODO: this is not nice as it modifies the option object and needs to expose the changes // outside the corpus as models need to know about the vocabulary size; extract the vocab // creation functionality from the class. options_->set("dim-vocabs", vocabDims); } } if(translate) { ABORT_IF(vocabPaths.empty(), "Translating, but vocabularies are not given!"); size_t numVocs = vocabPaths.size(); if(maxVocabs.size() < numVocs) maxVocabs.resize(paths_.size(), 0); auto vocabDims = options_->get>("dim-vocabs"); vocabDims.resize(numVocs, 0); for(size_t i = 0; i + 1 < numVocs; ++i) { Ptr vocab = New(options_, i); vocabDims[i] = (int) vocab->load(vocabPaths[i], maxVocabs[i]); vocabs_.emplace_back(vocab); } // TODO: As above, this is not nice as it modifies the option object and needs to expose the changes // outside the corpus as models need to know about the vocabulary size; extract the vocab // creation functionality from the class. options_->set("dim-vocabs", vocabDims); } for(auto path : paths_) { if(path == "stdin" || path == "-") files_.emplace_back(new std::istream(std::cin.rdbuf())); else { io::InputFileStream *strm = new io::InputFileStream(path); ABORT_IF(strm->empty(), "File '{}' is empty", path); files_.emplace_back(strm); } } ABORT_IF(!tsv_ && vocabs_.size() != files_.size(), "Number of {} files ({}) and vocab files ({}) does not agree", training ? "corpus" : "input", files_.size(), vocabs_.size()); // Handle guided alignment and data weighting files. Alignments and weights in TSV input were // handled earlier. if(training && !tsv_) { if(useGuidedAlignment) { auto path = options_->get("guided-alignment"); ABORT_IF(!filesystem::exists(path), "Alignment file does not exist"); LOG(info, "[data] Using word alignments from file {}", path); alignFileIdx_ = (int)paths_.size(); paths_.emplace_back(path); io::InputFileStream* strm = new io::InputFileStream(path); ABORT_IF(strm->empty(), "File with alignments '{}' is empty", path); files_.emplace_back(strm); } if(useDataWeighting) { auto path = options_->get("data-weighting"); ABORT_IF(!filesystem::exists(path), "Weight file does not exist"); LOG(info, "[data] Using weights from file {}", path); weightFileIdx_ = (int)paths_.size(); paths_.emplace_back(path); io::InputFileStream* strm = new io::InputFileStream(path); ABORT_IF(strm->empty(), "File with weights '{}' is empty", path); files_.emplace_back(strm); } } } void CorpusBase::addWordsToSentenceTuple(const std::string& line, size_t batchIndex, SentenceTupleImpl& tup) const { // This turns a string in to a sequence of numerical word ids. Depending // on the vocabulary type, this can be non-trivial, e.g. when SentencePiece // is used. Words words = vocabs_[batchIndex]->encode(line, /*addEOS =*/ addEOS_[batchIndex], inference_); ABORT_IF(words.empty(), "Empty input sequences are presently untested"); if(maxLengthCrop_ && words.size() > maxLength_) { words.resize(maxLength_); if(addEOS_[batchIndex]) words.back() = vocabs_[batchIndex]->getEosId(); } if(rightLeft_) std::reverse(words.begin(), words.end() - 1); tup.push_back(words); } void CorpusBase::addAlignmentToSentenceTuple(const std::string& line, SentenceTupleImpl& tup) const { ABORT_IF(rightLeft_, "Guided alignment and right-left model cannot be used together at the moment"); ABORT_IF(tup.size() != 2, "Using alignment between source and target, but sentence tuple has {} elements??", tup.size()); size_t srcEosPos = tup[0].size() - 1; size_t tgtEosPos = tup[1].size() - 1; auto align = WordAlignment(line, srcEosPos, tgtEosPos); tup.setAlignment(align); } void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTupleImpl& tup) const { auto elements = utils::split(line, " "); if(!elements.empty()) { std::vector weights; for(auto& e : elements) { // Iterate weights as strings if(maxLengthCrop_ && weights.size() >= maxLength_) // Cut if the input is going to be cut break; weights.emplace_back(std::stof(e)); // Add a weight converted into float } if(rightLeft_) std::reverse(weights.begin(), weights.end()); tup.setWeights(weights); } } void CorpusBase::addAlignmentsToBatch(Ptr batch, const std::vector& batchVector) { std::vector aligns; int dimBatch = (int)batch->getSentenceIds().size(); aligns.reserve(dimBatch); for(int b = 0; b < dimBatch; ++b) { // If the batch vector is altered within marian by, for example, case augmentation, // the guided alignments we received for this tuple cease to be valid. // Hence skip setting alignments for that sentence tuple.. if (!batchVector[b].isAltered()) { aligns.push_back(std::move(batchVector[b].getAlignment())); } } batch->setGuidedAlignment(std::move(aligns)); } void CorpusBase::addWeightsToBatch(Ptr batch, const std::vector& batchVector) { int dimBatch = (int)batch->size(); int trgWords = (int)batch->back()->batchWidth(); auto sentenceLevel = options_->get("data-weighting-type") == "sentence"; size_t size = sentenceLevel ? dimBatch : dimBatch * trgWords; std::vector weights(size, 1.f); for(int b = 0; b < dimBatch; ++b) { if(sentenceLevel) { weights[b] = batchVector[b].getWeights().front(); } else { size_t i = 0; for(auto& w : batchVector[b].getWeights()) { weights[b + i * dimBatch] = w; ++i; } } } batch->setDataWeights(weights); } void CorpusBase::initEOS(bool training = true) { // Labels fed into sub-batches that are just class-labels, not sequence labels do not require to // add a EOS symbol. Hence decision to add EOS is now based on input stream positions and // correspoding input type. // Determine the number of streams, i.e. the number of input files (if --train-sets) or fields in // a TSV input (if --tsv). Notice that in case of a TSV input, fields that contain alignments and // weights are *not* included. size_t numStreams = tsv_ ? tsvNumInputFields_ : paths_.size(); addEOS_.resize(numStreams, true); // input-types provides the input type for each input file (if --train-sets) or for each TSV field // (if --tsv), for example: sequence, class, alignment. auto inputTypes = options_->get>("input-types", {}); // empty list by default // @TODO: think if this should be checked and processed here or in a validation step in config? if(!inputTypes.empty()) { if(tsv_) { // Remove 'alignment' and 'weight' from input types. // Note that these input types are not typical input streams with corresponding vocabularies. // For a TSV input, they were used only to determine fields that contain alignments or weights // and initialize guided-alignment and data-weighting options. auto pos = std::find(inputTypes.begin(), inputTypes.end(), "alignment"); if(pos != inputTypes.end()) inputTypes.erase(pos); pos = std::find(inputTypes.begin(), inputTypes.end(), "weight"); if(pos != inputTypes.end()) inputTypes.erase(pos); } // Make sure there is an input type for each stream // and that there is an equal number of input types and streams when training ABORT_IF((inputTypes.size() < numStreams) || (training && inputTypes.size() != numStreams), "Input types have been specified ({}), you need to specify one per input stream ({})", inputTypes.size(), numStreams); } for(int i = 0; i < numStreams; ++i) if(inputTypes.size() > i) { if(inputTypes[i] == "class") addEOS_[i] = false; else if(inputTypes[i] == "sequence") addEOS_[i] = true; else ABORT("Unknown input type {}: {}", i, inputTypes[i]); } else { // No input type specified, assuming "sequence" addEOS_[i] = true; } } size_t CorpusBase::getNumberOfTSVInputFields(Ptr options) { if(options->get("tsv", false)) { size_t n = options->get("tsv-fields", 0); if(n > 0 && options->get("guided-alignment", std::string("none")) != "none") --n; if(n > 0 && options->hasAndNotEmpty("data-weighting")) --n; return n; } return 0; } #if 0 // experimental: hide inline-fix source tokens from cross attention std::vector SubBatch::crossMaskWithInlineFixSourceSuppressed() const { const auto& srcVocab = *vocab(); auto factoredVocab = vocab()->tryAs(); size_t inlineFixGroupIndex = 0, inlineFixSrc = 0; auto hasInlineFixFactors = factoredVocab && factoredVocab->tryGetFactor(FactoredVocab_INLINE_FIX_WHAT_serialized, /*out*/ inlineFixGroupIndex, /*out*/ inlineFixSrc); auto fixSrcId = srcVocab[FactoredVocab_FIX_SRC_ID_TAG]; auto fixTgtId = srcVocab[FactoredVocab_FIX_TGT_ID_TAG]; auto fixEndId = srcVocab[FactoredVocab_FIX_END_ID_TAG]; auto unkId = srcVocab.getUnkId(); auto hasInlineFixTags = fixSrcId != unkId && fixTgtId != unkId && fixEndId != unkId; auto m = mask(); // default return value, which we will modify in-place below in case we need to if (hasInlineFixFactors || hasInlineFixTags) { LOG_ONCE(info, "[data] Suppressing cross-attention into inline-fix source tokens"); // example: force French translation of name "frank" to always be "franck" // - hasInlineFixFactors: "frank|is franck|it", "frank|is" cannot be cross-attended to // - hasInlineFixTags: " frank franck ", "frank" and all tags cannot be cross-attended to auto dimBatch = batchSize(); // number of sentences in the batch auto dimWidth = batchWidth(); // number of words in the longest sentence in the batch const auto& d = data(); size_t numWords = 0; for (size_t b = 0; b < dimBatch; b++) { // loop over batch entries bool inside = false; for (size_t s = 0; s < dimWidth; s++) { // loop over source positions auto i = locate(/*batchIdx=*/b, /*wordPos=*/s); if (!m[i]) break; numWords++; // keep track of entering/exiting the inline-fix source tags auto w = d[i]; if (w == fixSrcId) inside = true; else if (w == fixTgtId) inside = false; bool wHasSrcIdFactor = hasInlineFixFactors && factoredVocab->getFactor(w, inlineFixGroupIndex) == inlineFixSrc; if (inside || w == fixSrcId || w == fixTgtId || w == fixEndId || wHasSrcIdFactor) m[i] = 0.0f; // decoder must not look at embedded source, nor the markup tokens } } ABORT_IF(batchWords() != 0/*n/a*/ && numWords != batchWords(), "batchWords() inconsistency??"); } return m; } #endif } // namespace data } // namespace marian