Program Listing for File corpus.h

Return to documentation for file (src/data/corpus.h)

#pragma once

#include <fstream>
#include <iostream>
#include <random>

#include "3rd_party/threadpool.h"
#include "common/definitions.h"
#include "common/file_stream.h"
#include "common/options.h"
#include "data/alignment.h"
#include "data/batch.h"
#include "data/corpus_base.h"
#include "data/dataset.h"
#include "data/vocab.h"

namespace marian {
namespace data {

/**
 * Corpus data set, publicly derived from CorpusBase.
 *
 * This header only declares the interface: apart from the three trivial
 * inline accessors (begin(), end(), getVocabs()), every member function is
 * defined outside of this class body.
 */
class Corpus : public CorpusBase {
public:
  // --- construction ---------------------------------------------------------

  // @TODO: check if translate can be replaced by an option in options
  Corpus(Ptr<Options> options, bool translate = false, size_t seed = Config::seed);

  // Variant that receives the data paths and vocabularies explicitly.
  Corpus(std::vector<std::string> paths,
         std::vector<Ptr<Vocab>> vocabs,
         Ptr<Options> options,
         size_t seed = Config::seed);

  // --- overrides of virtual functions inherited through CorpusBase ----------

  Sample next() override;
  void shuffle() override;
  void reset() override;
  void restore(Ptr<TrainingState>) override;
  batch_ptr toBatch(const std::vector<Sample>& batchVector) override;

  // begin() builds an iterator from this object; end() is a
  // default-constructed iterator.
  iterator begin() override { return iterator(this); }
  iterator end() override { return iterator(); }

  // Hands out a mutable reference to vocabs_ (not declared in this class;
  // presumably a CorpusBase member -- confirm there).
  std::vector<Ptr<Vocab>>& getVocabs() override { return vocabs_; }

private:
  // --- helpers --------------------------------------------------------------

  void shuffleData(const std::vector<std::string>& paths);

  // Pre-processing hook for a single line of stream `streamId`.
  // `altered` reports whether the segmentation was altered in marian.
  void preprocessLine(std::string& line, size_t streamId, bool& altered);

  // --- state ----------------------------------------------------------------

  std::vector<UPtr<io::TemporaryFile>> tempFiles_;
  std::vector<size_t> ids_;

  // Thread pool for parallelized data reading.
  UPtr<ThreadPool> threadPool_;

  // Shuffle-in-RAM support.
  bool shuffleInRAM_ = false;
  // Full copy of all data files, indexed as [stream][id].
  std::vector<std::vector<std::string>> corpusInRAM_;

  // Pre-processing settings.
  // If set, every N-th input sentence (after randomization) is converted to
  // all-caps (source and target).
  size_t allCapsEvery_ = 0;
  // Same idea for title case (source only).
  size_t titleCaseEvery_ = 0;
};
}  // namespace data
}  // namespace marian