Program Listing for File corpus_base.cpp (src/data/corpus_base.cpp)
#include <random>
#include "common/file_utils.h"
#include "data/corpus.h"
#include "data/factored_vocab.h"
namespace marian {
namespace data {
typedef std::vector<size_t> WordBatch;
typedef std::vector<float> MaskBatch;
typedef std::pair<WordBatch, MaskBatch> WordMask;
typedef std::vector<WordMask> SentBatch;
void SentenceTupleImpl::setWeights(const std::vector<float>& weights) {
if(weights.size() != 1) { // this assumes a single sentence-level weight is always fine
ABORT_IF(empty(), "Source and target sequences should be added to a tuple before data weights");
auto numWeights = weights.size();
auto numTrgWords = back().size();
// word-level weights may or may not contain a weight for EOS tokens
if(numWeights != numTrgWords && numWeights != numTrgWords - 1)
LOG(warn,
"[warn] "
"Number of weights ({}) does not match the number of target words ({}) in line #{}",
numWeights,
numTrgWords,
id_);
}
weights_ = weights;
}
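// Illustrative note (not part of the original file): for a target sentence of
// 4 tokens including EOS, setWeights() accepts a single sentence-level weight
// such as {0.5f}, word-level weights for all tokens {1.f, 1.f, 1.f, 1.f}, or
// word-level weights without EOS {1.f, 1.f, 1.f}; any other length only
// triggers the warning above and the weights are stored unchanged.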
CorpusIterator::CorpusIterator() : pos_(-1) {}
CorpusIterator::CorpusIterator(CorpusBase* corpus)
: corpus_(corpus), pos_(0), tup_(corpus_->next()) {}
void CorpusIterator::increment() {
tup_ = corpus_->next();
pos_++;
}
bool CorpusIterator::equal(CorpusIterator const& other) const {
return this->pos_ == other.pos_ || (!this->tup_.valid() && !other.tup_.valid());
}
const SentenceTuple& CorpusIterator::dereference() const {
return tup_;
}
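// Hypothetical usage sketch (not part of the original file), assuming the
// iterator-facade adapter declared in the header maps equal/increment/
// dereference onto ==/++/*:
#if 0
void drainCorpus(CorpusBase* corpus) {
  CorpusIterator it(corpus), end; // a default-constructed iterator acts as end
  for(; it != end; ++it) {
    const SentenceTuple& tup = *it; // one source/target sentence tuple
    (void)tup; // process the tuple here
  }
}
#endif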
// Corpus constructors of this type are used by in-training validators (only?),
// so they do not load additional files for guided alignment or data weighting.
CorpusBase::CorpusBase(const std::vector<std::string>& paths,
const std::vector<Ptr<Vocab>>& vocabs,
Ptr<Options> options,
size_t seed)
: DatasetBase(paths, options), RNGEngine(seed),
vocabs_(vocabs),
maxLength_(options_->get<size_t>("max-length")),
maxLengthCrop_(options_->get<bool>("max-length-crop")),
rightLeft_(options_->get<bool>("right-left")),
tsv_(options_->get<bool>("tsv", false)),
tsvNumInputFields_(getNumberOfTSVInputFields(options)) {
// TODO: support passing only one vocab file if we have fully-tied embeddings
if(tsv_) {
ABORT_IF(tsvNumInputFields_ != vocabs_.size(),
"Number of TSV input fields and vocab files does not agree");
} else {
ABORT_IF(paths_.size() != vocabs_.size(),
"Number of corpus files and vocab files does not agree");
}
for(auto path : paths_) {
UPtr<io::InputFileStream> strm(new io::InputFileStream(path));
ABORT_IF(strm->empty(), "File '{}' is empty", path);
files_.emplace_back(std::move(strm));
}
initEOS(/*training=*/true);
}
CorpusBase::CorpusBase(Ptr<Options> options, bool translate, size_t seed)
: DatasetBase(options), RNGEngine(seed),
maxLength_(options_->get<size_t>("max-length")),
maxLengthCrop_(options_->get<bool>("max-length-crop")),
rightLeft_(options_->get<bool>("right-left")),
tsv_(options_->get<bool>("tsv", false)),
tsvNumInputFields_(getNumberOfTSVInputFields(options)) {
bool training = !translate;
if(training)
paths_ = options_->get<std::vector<std::string>>("train-sets");
else
paths_ = options_->get<std::vector<std::string>>("input");
std::vector<std::string> vocabPaths;
if(!options_->get<std::vector<std::string>>("vocabs").empty())
vocabPaths = options_->get<std::vector<std::string>>("vocabs");
if(training) {
if(tsv_) {
ABORT_IF(!vocabPaths.empty() && tsvNumInputFields_ != vocabPaths.size(),
"Number of TSV input fields and vocab files does not agree");
} else {
ABORT_IF(!vocabPaths.empty() && paths_.size() != vocabPaths.size(),
"Number of corpus files and vocab files does not agree");
}
}
bool useGuidedAlignment = options_->get("guided-alignment", std::string("none")) != "none";
bool useDataWeighting = options_->hasAndNotEmpty("data-weighting");
if(training && tsv_) {
// For TSV input, we expect that guided-alignment or data-weighting provide the index of a TSV
// field that contains the alignments or weights.
//
// Alignments and weights for non-TSV input are handled later, after vocab creation.
if(useGuidedAlignment) {
try {
alignFileIdx_ = std::stoul(options_->get<std::string>("guided-alignment"));
} catch(const std::invalid_argument& /*e*/) {
ABORT(
"For TSV input, guided-alignment must provide an index of a field with alignments. "
"The value '{}' could not be converted to an unsigned integer.",
options_->get<std::string>("guided-alignment"));
}
LOG(info, "[data] Using word alignments from TSV field no. {}", alignFileIdx_);
}
if(useDataWeighting) {
try {
weightFileIdx_ = std::stoul(options_->get<std::string>("data-weighting"));
} catch(const std::invalid_argument& /*e*/) {
ABORT(
"For TSV input, data-weighting must provide an index of a field with weights. "
"The value '{}' could not be converted to an unsigned integer.",
options_->get<std::string>("data-weighting"));
}
LOG(info, "[data] Using weights from TSV field no. {}", weightFileIdx_);
}
// check for identical or too large indices
size_t maxIndex = tsvNumInputFields_ + size_t(useGuidedAlignment) + size_t(useDataWeighting) - 1;
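// e.g. with --tsv-fields 4 and both options enabled, tsvNumInputFields_ == 2
// (see getNumberOfTSVInputFields() below), so maxIndex == 2 + 1 + 1 - 1 == 3,
// i.e. the alignment and weight fields must use distinct indices from 0 to 3.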
ABORT_IF((useGuidedAlignment && useDataWeighting && alignFileIdx_ == weightFileIdx_)
|| (useGuidedAlignment && (alignFileIdx_ > maxIndex))
|| (useDataWeighting && (weightFileIdx_ > maxIndex)),
"For TSV input, guided-alignment and data-weighting must provide an index <= {} "
"and be different",
maxIndex);
}
// run this after determining if guided alignment or data weighting is used in TSV input
initEOS(training);
// @TODO: check if size_t can be used instead of int
std::vector<int> maxVocabs = options_->get<std::vector<int>>("dim-vocabs");
// training or scoring
if(training) {
// Marian can create vocabularies automatically if no vocabularies are given or they do not
// exist under the specified paths.
//
// Possible cases:
// * -t train1 train2 -v vocab1 vocab2
// If vocab1 or vocab2 exists, they are loaded, otherwise separate .yml vocabularies are
// created only from train1 or train2 respectively.
//
// * -t train1 train2 -v vocab vocab
// If vocab exists, it is loaded, otherwise it is created from concatenated train1 and train2
// files.
//
// * -t train1 train2
// If no path is given, separate vocabularies train1.yml and train2.yml are created from
// train1 and train2 respectively.
//
// * --tsv -t train.tsv -v vocab1 vocab2
// If vocab1 or vocab2 exists, it is loaded; otherwise each vocabulary is created from the
// appropriate fields in train.tsv.
//
// * --tsv -t train.tsv -v vocab vocab
// If vocab exists, it is loaded; otherwise it is created from all fields in train.tsv.
//
// * --tsv -t train.tsv
// If no path is given, a train.tsv.yml is created from all fields in train.tsv.
//
// * cat file.tsv | --tsv -t stdin -v vocab1 vocab2
// If either vocab1 or vocab2 does not exist, an error is shown that creation of vocabularies
// from stdin is not supported.
//
// * cat file.tsv | --tsv -t stdin -v vocab vocab
// If vocab does not exist, an error is shown that creation of a vocabulary from stdin is not
// supported.
//
// * cat file.tsv | --tsv -t stdin
// As above, an error is shown that creation of a vocabulary from stdin is not supported.
//
// There are more cases for multi-encoder models than listed above.
//
if(vocabPaths.empty()) {
size_t numStreams = tsv_ ? tsvNumInputFields_ : paths_.size();
if(tsv_) {
// Creating a vocabulary from stdin is not supported
ABORT_IF(paths_[0] == "stdin" || paths_[0] == "-",
"Creating vocabularies automatically from a data stream from STDIN is not "
"supported. Create vocabularies first and provide them with --vocabs");
// Creating a vocab from a TSV input (from STDIN or a file) with alignments or weights is
// not supported
ABORT_IF(useGuidedAlignment,
"Creating vocabularies automatically from TSV data with alignments is not "
"supported. Create vocabularies first and provide them with --vocabs");
ABORT_IF(useDataWeighting,
"Creating vocabularies automatically from TSV data with weights is not "
"supported. Create vocabularies first and provide them with --vocabs");
}
if(maxVocabs.size() < paths_.size())
maxVocabs.resize(paths_.size(), 0);
LOG(info,
"[data] No vocabulary files given, trying to find or build based on training data.");
if(!tsv_)
LOG(info, "[data] Vocabularies will be built separately for each file.");
else
LOG(info, "[data] A joint vocabulary will be built from the TSV file.");
std::vector<int> vocabDims(numStreams, 0);
std::vector<std::string> vocabPaths1(numStreams);
// Create vocabs if not provided
for(size_t i = 0; i < numStreams; ++i) {
Ptr<Vocab> vocab = New<Vocab>(options_, i);
const auto& path = paths_[tsv_ ? 0 : i]; // idx 0 because there is always only one TSV file
std::vector<std::string> trainPaths = {path};
vocabPaths1[i] = path + ".yml";
vocabDims[i] = (int) vocab->loadOrCreate("", trainPaths, maxVocabs[i]);
vocabs_.emplace_back(vocab);
}
// TODO: this is not nice as it modifies the option object and needs to expose the changes
// outside the corpus as models need to know about the vocabulary size; extract the vocab
// creation functionality from the class.
options_->set("dim-vocabs", vocabDims, "vocabs", vocabPaths1);
} else { // Vocabulary paths are given
size_t numStreams = tsv_ ? tsvNumInputFields_ : paths_.size();
// Load all vocabs
size_t numVocs = vocabPaths.size();
if(maxVocabs.size() < numVocs)
maxVocabs.resize(numStreams, 0);
// Helper object for grouping training data based on vocabulary file name
struct VocabDetails {
std::set<std::string> paths; // all paths that are used for training the vocabulary
std::vector<size_t> streams; // indices of the streams (positions in --vocabs) that use this vocabulary
size_t size; // the maximum vocabulary size
};
// Group training files based on vocabulary path. If the same
// vocab path corresponds to different training files, this means
// that a single vocab should combine tokens from all files.
std::map<std::string, VocabDetails> groupVocab; // vocabPath -> (trainPaths[], vocabSize)
for(size_t i = 0; i < numVocs; ++i) {
// Index 0 because there is always only a single TSV input file
groupVocab[vocabPaths[i]].paths.insert(paths_[tsv_ ? 0 : i]);
groupVocab[vocabPaths[i]].streams.push_back(i);
if(groupVocab[vocabPaths[i]].size < maxVocabs[i])
groupVocab[vocabPaths[i]].size = maxVocabs[i];
}
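// e.g. for "-t train.src train.trg -v vocab.yml vocab.yml" this produces a
// single entry vocab.yml -> {paths: {train.src, train.trg}, streams: {0, 1}},
// so one joint vocabulary is built from both training files.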
auto vocabDims = options_->get<std::vector<int>>("dim-vocabs");
vocabDims.resize(numVocs, 0); // make sure there are as many dims as vocab paths
for(size_t i = 0; i < numVocs; ++i) {
if(tsv_) {
bool noVocabGiven = (vocabPaths[i].empty() || !filesystem::exists(vocabPaths[i]));
// Creating a vocabulary from stdin is not supported
ABORT_IF(noVocabGiven && (paths_[0] == "stdin" || paths_[0] == "-"),
"Creating vocabulary automatically from a data stream from STDIN is not "
"supported. Create vocabularies first and provide them with --vocabs");
// Creating a vocab from a TSV input (from STDIN or a file) with alignments or weights is not supported
ABORT_IF(noVocabGiven && useGuidedAlignment,
"Creating vocabularies automatically from TSV data with alignments is not "
"supported. Create vocabularies first and provide them with --vocabs");
ABORT_IF(noVocabGiven && useDataWeighting,
"Creating vocabularies automatically from TSV data with weights is not "
"supported. Create vocabularies first and provide them with --vocabs");
}
// Get the set of training files that corresponds to this vocab path. If a later stream
// uses the same vocab path, the vocabulary is not created again but simply loaded.
auto vocabDetails = groupVocab[vocabPaths[i]];
std::vector<std::string> groupedPaths(vocabDetails.paths.begin(), vocabDetails.paths.end());
Ptr<io::TemporaryFile> tsvTempFile; // temporary file for fields cut from the TSV input
// For a TSV input, multiple vocabularies with different names mean separate
// vocabularies for source(s) and target.
// If a vocabulary does not exist, it will be created in the next step. To be able to create
// a separate vocabulary, we cut tab-separated field(s) from the TSV file, e.g. all source
// or target sentences, into a temporary file.
if(tsv_ && groupVocab.size() > 1 && !filesystem::exists(vocabPaths[i])) {
ABORT_IF(groupedPaths.size() > 1, "There should not be multiple TSV input files!");
tsvTempFile.reset(new io::TemporaryFile(options_->get<std::string>("tempdir"), false));
LOG(info,
"[data] Cutting field(s) {} from {} into a temporary file {}",
utils::join(vocabDetails.streams, ", "),
groupedPaths[0],
tsvTempFile->getFileName());
fileutils::cut(groupedPaths[0], // Index 0 because there is only one TSV file
tsvTempFile,
vocabDetails.streams,
tsvNumInputFields_,
" "); // Notice that tab-separated fields are joined with a whitespace
groupedPaths.clear();
groupedPaths.push_back(tsvTempFile->getFileName());
}
// Load or create the vocabulary
Ptr<Vocab> vocab = New<Vocab>(options_, i);
vocabDims[i] = (int) vocab->loadOrCreate(vocabPaths[i], groupedPaths, vocabDetails.size);
vocabs_.emplace_back(vocab);
if(tsvTempFile)
tsvTempFile.reset();
}
// TODO: this is not nice as it modifies the option object and needs to expose the changes
// outside the corpus as models need to know about the vocabulary size; extract the vocab
// creation functionality from the class.
options_->set("dim-vocabs", vocabDims);
}
}
if(translate) {
ABORT_IF(vocabPaths.empty(), "Translating, but vocabularies are not given!");
size_t numVocs = vocabPaths.size();
if(maxVocabs.size() < numVocs)
maxVocabs.resize(paths_.size(), 0);
auto vocabDims = options_->get<std::vector<int>>("dim-vocabs");
vocabDims.resize(numVocs, 0);
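// Note: the loop below stops at numVocs - 1, so only the source vocabularies
// are loaded here; the last path corresponds to the target vocabulary, which
// is not needed for reading translation inputs.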
for(size_t i = 0; i + 1 < numVocs; ++i) {
Ptr<Vocab> vocab = New<Vocab>(options_, i);
vocabDims[i] = (int) vocab->load(vocabPaths[i], maxVocabs[i]);
vocabs_.emplace_back(vocab);
}
// TODO: As above, this is not nice as it modifies the option object and needs to expose the changes
// outside the corpus as models need to know about the vocabulary size; extract the vocab
// creation functionality from the class.
options_->set("dim-vocabs", vocabDims);
}
for(auto path : paths_) {
if(path == "stdin" || path == "-")
files_.emplace_back(new std::istream(std::cin.rdbuf()));
else {
io::InputFileStream *strm = new io::InputFileStream(path);
ABORT_IF(strm->empty(), "File '{}' is empty", path);
files_.emplace_back(strm);
}
}
ABORT_IF(!tsv_ && vocabs_.size() != files_.size(),
"Number of {} files ({}) and vocab files ({}) does not agree",
training ? "corpus" : "input",
files_.size(),
vocabs_.size());
// Handle guided alignment and data weighting files. Alignments and weights in TSV input were
// handled earlier.
if(training && !tsv_) {
if(useGuidedAlignment) {
auto path = options_->get<std::string>("guided-alignment");
ABORT_IF(!filesystem::exists(path), "Alignment file does not exist");
LOG(info, "[data] Using word alignments from file {}", path);
alignFileIdx_ = (int)paths_.size();
paths_.emplace_back(path);
io::InputFileStream* strm = new io::InputFileStream(path);
ABORT_IF(strm->empty(), "File with alignments '{}' is empty", path);
files_.emplace_back(strm);
}
if(useDataWeighting) {
auto path = options_->get<std::string>("data-weighting");
ABORT_IF(!filesystem::exists(path), "Weight file does not exist");
LOG(info, "[data] Using weights from file {}", path);
weightFileIdx_ = (int)paths_.size();
paths_.emplace_back(path);
io::InputFileStream* strm = new io::InputFileStream(path);
ABORT_IF(strm->empty(), "File with weights '{}' is empty", path);
files_.emplace_back(strm);
}
}
}
void CorpusBase::addWordsToSentenceTuple(const std::string& line,
size_t batchIndex,
SentenceTupleImpl& tup) const {
// This turns a string into a sequence of numerical word ids. Depending
// on the vocabulary type, this can be non-trivial, e.g. when SentencePiece
// is used.
Words words = vocabs_[batchIndex]->encode(line, /*addEOS =*/ addEOS_[batchIndex], inference_);
ABORT_IF(words.empty(), "Empty input sequences are presently untested");
if(maxLengthCrop_ && words.size() > maxLength_) {
words.resize(maxLength_);
if(addEOS_[batchIndex])
words.back() = vocabs_[batchIndex]->getEosId();
}
if(rightLeft_)
std::reverse(words.begin(), words.end() - 1);
tup.push_back(words);
}
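// Illustrative sketch (not part of the original file): cropping keeps the
// sequence terminated, e.g. with maxLength_ == 3 and EOS enabled,
// [w1 w2 w3 w4 EOS] becomes [w1 w2 EOS]:
#if 0
void cropExample(Words& words, size_t maxLength, Word eosId) { // hypothetical helper
  if(words.size() > maxLength) {
    words.resize(maxLength);
    words.back() = eosId; // re-terminate the cropped sequence
  }
}
#endif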
void CorpusBase::addAlignmentToSentenceTuple(const std::string& line,
SentenceTupleImpl& tup) const {
ABORT_IF(rightLeft_, "Guided alignment and right-left model cannot be used together at the moment");
ABORT_IF(tup.size() != 2, "Using alignment between source and target, but sentence tuple has {} elements??", tup.size());
size_t srcEosPos = tup[0].size() - 1;
size_t tgtEosPos = tup[1].size() - 1;
auto align = WordAlignment(line, srcEosPos, tgtEosPos);
tup.setAlignment(align);
}
void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTupleImpl& tup) const {
auto elements = utils::split(line, " ");
if(!elements.empty()) {
std::vector<float> weights;
for(auto& e : elements) { // Iterate weights as strings
if(maxLengthCrop_ && weights.size() >= maxLength_) // Cut if the input is going to be cut
break;
weights.emplace_back(std::stof(e)); // Add a weight converted into float
}
if(rightLeft_)
std::reverse(weights.begin(), weights.end());
tup.setWeights(weights);
}
}
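// e.g. a weight line "0.3 1.2 0.9" yields weights {0.3f, 1.2f, 0.9f}; for
// right-left models the order is reversed to match the reversed target words.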
void CorpusBase::addAlignmentsToBatch(Ptr<CorpusBatch> batch,
const std::vector<Sample>& batchVector) {
std::vector<WordAlignment> aligns;
int dimBatch = (int)batch->getSentenceIds().size();
aligns.reserve(dimBatch);
for(int b = 0; b < dimBatch; ++b) {
// If the batch vector is altered within marian by, for example, case augmentation,
// the guided alignments we received for this tuple cease to be valid.
// Hence skip setting alignments for that sentence tuple.
if (!batchVector[b].isAltered()) {
aligns.push_back(std::move(batchVector[b].getAlignment()));
}
}
batch->setGuidedAlignment(std::move(aligns));
}
void CorpusBase::addWeightsToBatch(Ptr<CorpusBatch> batch,
const std::vector<Sample>& batchVector) {
int dimBatch = (int)batch->size();
int trgWords = (int)batch->back()->batchWidth();
auto sentenceLevel
= options_->get<std::string>("data-weighting-type") == "sentence";
size_t size = sentenceLevel ? dimBatch : dimBatch * trgWords;
std::vector<float> weights(size, 1.f);
for(int b = 0; b < dimBatch; ++b) {
if(sentenceLevel) {
weights[b] = batchVector[b].getWeights().front();
} else {
size_t i = 0;
for(auto& w : batchVector[b].getWeights()) {
weights[b + i * dimBatch] = w;
++i;
}
}
}
batch->setDataWeights(weights);
}
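// Illustrative note (not part of the original file): word-level weights are
// stored time-major: weights[b + i * dimBatch] is the weight of the i-th
// target word of sentence b, i.e. the vector holds the first word of every
// sentence, then the second word of every sentence, and so on, matching the
// layout of the target sub-batch.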
void CorpusBase::initEOS(bool training = true) {
// Sub-batches that carry plain class labels rather than sequence labels do not require an
// EOS symbol. Hence the decision to add EOS is now based on the input stream position and
// the corresponding input type.
// Determine the number of streams, i.e. the number of input files (if --train-sets) or fields in
// a TSV input (if --tsv). Notice that in case of a TSV input, fields that contain alignments and
// weights are *not* included.
size_t numStreams = tsv_ ? tsvNumInputFields_ : paths_.size();
addEOS_.resize(numStreams, true);
// input-types provides the input type for each input file (if --train-sets) or for each TSV field
// (if --tsv), for example: sequence, class, alignment.
auto inputTypes = options_->get<std::vector<std::string>>("input-types", {}); // empty list by default
// @TODO: think if this should be checked and processed here or in a validation step in config?
if(!inputTypes.empty()) {
if(tsv_) {
// Remove 'alignment' and 'weight' from input types.
// Note that these input types are not typical input streams with corresponding vocabularies.
// For a TSV input, they were used only to determine fields that contain alignments or weights
// and initialize guided-alignment and data-weighting options.
auto pos = std::find(inputTypes.begin(), inputTypes.end(), "alignment");
if(pos != inputTypes.end())
inputTypes.erase(pos);
pos = std::find(inputTypes.begin(), inputTypes.end(), "weight");
if(pos != inputTypes.end())
inputTypes.erase(pos);
}
// Make sure there is an input type for each stream
// and that there is an equal number of input types and streams when training
ABORT_IF((inputTypes.size() < numStreams) || (training && inputTypes.size() != numStreams),
"Input types have been specified ({}), you need to specify one per input stream ({})",
inputTypes.size(), numStreams);
}
for(size_t i = 0; i < numStreams; ++i)
if(inputTypes.size() > i) {
if(inputTypes[i] == "class")
addEOS_[i] = false;
else if(inputTypes[i] == "sequence")
addEOS_[i] = true;
else
ABORT("Unknown input type {}: {}", i, inputTypes[i]);
} else {
// No input type specified, assuming "sequence"
addEOS_[i] = true;
}
}
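// e.g. "--input-types sequence class" for two input streams yields
// addEOS_ == {true, false}: the first stream is a token sequence and gets an
// EOS appended, the second carries class labels and does not.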
size_t CorpusBase::getNumberOfTSVInputFields(Ptr<Options> options) {
if(options->get<bool>("tsv", false)) {
size_t n = options->get<size_t>("tsv-fields", 0);
if(n > 0 && options->get("guided-alignment", std::string("none")) != "none")
--n;
if(n > 0 && options->hasAndNotEmpty("data-weighting"))
--n;
return n;
}
return 0;
}
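// e.g. "--tsv --tsv-fields 4 --guided-alignment 2 --data-weighting 3" yields
// 4 - 1 - 1 == 2 actual input fields (e.g. source and target); the alignment
// and weight fields are not counted as input streams.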
#if 0
// experimental: hide inline-fix source tokens from cross attention
std::vector<float> SubBatch::crossMaskWithInlineFixSourceSuppressed() const
{
const auto& srcVocab = *vocab();
auto factoredVocab = vocab()->tryAs<FactoredVocab>();
size_t inlineFixGroupIndex = 0, inlineFixSrc = 0;
auto hasInlineFixFactors = factoredVocab && factoredVocab->tryGetFactor(FactoredVocab_INLINE_FIX_WHAT_serialized, /*out*/ inlineFixGroupIndex, /*out*/ inlineFixSrc);
auto fixSrcId = srcVocab[FactoredVocab_FIX_SRC_ID_TAG];
auto fixTgtId = srcVocab[FactoredVocab_FIX_TGT_ID_TAG];
auto fixEndId = srcVocab[FactoredVocab_FIX_END_ID_TAG];
auto unkId = srcVocab.getUnkId();
auto hasInlineFixTags = fixSrcId != unkId && fixTgtId != unkId && fixEndId != unkId;
auto m = mask(); // default return value, which we will modify in-place below in case we need to
if (hasInlineFixFactors || hasInlineFixTags) {
LOG_ONCE(info, "[data] Suppressing cross-attention into inline-fix source tokens");
// example: force French translation of name "frank" to always be "franck"
// - hasInlineFixFactors: "frank|is franck|it", "frank|is" cannot be cross-attended to
// - hasInlineFixTags: "<IOPEN> frank <IDELIM> franck <ICLOSE>", "frank" and all tags cannot be cross-attended to
auto dimBatch = batchSize(); // number of sentences in the batch
auto dimWidth = batchWidth(); // number of words in the longest sentence in the batch
const auto& d = data();
size_t numWords = 0;
for (size_t b = 0; b < dimBatch; b++) { // loop over batch entries
bool inside = false;
for (size_t s = 0; s < dimWidth; s++) { // loop over source positions
auto i = locate(/*batchIdx=*/b, /*wordPos=*/s);
if (!m[i])
break;
numWords++;
// keep track of entering/exiting the inline-fix source tags
auto w = d[i];
if (w == fixSrcId)
inside = true;
else if (w == fixTgtId)
inside = false;
bool wHasSrcIdFactor = hasInlineFixFactors && factoredVocab->getFactor(w, inlineFixGroupIndex) == inlineFixSrc;
if (inside || w == fixSrcId || w == fixTgtId || w == fixEndId || wHasSrcIdFactor)
m[i] = 0.0f; // decoder must not look at embedded source, nor the markup tokens
}
}
ABORT_IF(batchWords() != 0/*n/a*/ && numWords != batchWords(), "batchWords() inconsistency??");
}
return m;
}
#endif
} // namespace data
} // namespace marian