.. _program_listing_file_src_common_utils.cpp: Program Listing for File utils.cpp ================================== |exhale_lsh| :ref:`Return to documentation for file ` (``src/common/utils.cpp``) .. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS .. code-block:: cpp #include "common/utils.h" #include "common/logging.h" #include "CLI/StringTools.hpp" #include #include #include #include #include #include #if defined(__unix__) || defined(__APPLE__) #include #endif #include #include // MACOS lacks HOST_NAME_MAX #ifndef HOST_NAME_MAX # if defined(_POSIX_HOST_NAME_MAX) # define HOST_NAME_MAX _POSIX_HOST_NAME_MAX # elif defined(MAXHOSTNAMELEN) # define HOST_NAME_MAX MAXHOSTNAMELEN # endif #endif namespace marian { namespace utils { void trim(std::string& s) { CLI::detail::trim(s, " \t\n"); } void trimRight(std::string& s) { CLI::detail::rtrim(s, " \t\n"); } void trimLeft(std::string& s) { CLI::detail::ltrim(s, " \t\n"); } void split(const std::string& line, /*out*/ std::vector& pieces, const std::string& del /*= " "*/, bool keepEmpty /*= false*/, bool anyOf /*= false*/) { pieces.clear(); size_t begin = 0; size_t pos = 0; std::string token; size_t delSize = anyOf ? 1 : del.size(); while(true) { pos = anyOf ? line.find_first_of(del, begin) : line.find(del, begin); if(pos == std::string::npos) break; if(pos >= begin) { token = line.substr(begin, pos - begin); if(token.size() > 0 || keepEmpty) pieces.push_back(token); } begin = pos + delSize; } if(pos >= begin) { token = line.substr(begin, pos - begin); if(token.size() > 0 || keepEmpty) pieces.push_back(token); } } // the function guarantees that the output has as many elements as requested void splitTsv(const std::string& line, std::vector& fields, size_t numFields) { fields.clear(); fields.resize(numFields); // make sure there is as many elements as requested size_t begin = 0; size_t pos = 0; for(size_t i = 0; i < numFields; ++i) { pos = line.find('\t', begin); if(pos == std::string::npos) { fields[i] = line.substr(begin); break; } fields[i] = line.substr(begin, pos - begin); begin = pos + 1; } ABORT_IF(pos != std::string::npos, "Excessive field(s) in the tab-separated line: '{}'", line); } std::vector split(const std::string& line, const std::string& del /*= " "*/, bool keepEmpty /*= false*/, bool anyOf /*= false*/) { std::vector pieces; split(line, pieces, del, keepEmpty, anyOf); return pieces; } void splitAny(const std::string& line, /*out*/ std::vector& pieces, const std::string& del /*= " "*/, bool keepEmpty /*= false*/) { split(line, pieces, del, keepEmpty, /*anyOf =*/true); } std::vector splitAny(const std::string& line, const std::string& del /*= " "*/, bool keepEmpty /*= false*/) { return split(line, del, keepEmpty, /*anyOf =*/true); } std::string join(const std::vector& words, const std::string& del /*= " "*/) { if(words.empty()) return ""; std::stringstream ss; ss << words[0]; for(size_t i = 1; i < words.size(); ++i) ss << del << words[i]; return ss.str(); } std::string join(const std::vector& nums, const std::string& del /*= " "*/) { std::vector words(nums.size()); std::transform(nums.begin(), nums.end(), words.begin(), [](size_t i) { return std::to_string(i); }); return join(words, del); } std::string join(const std::vector& nums, const std::string& del /*= " "*/, size_t prec /*= 5*/) { if(nums.empty()) return ""; std::stringstream ss; ss << std::fixed << std::setprecision(prec) << nums[0]; for(size_t i = 1; i < nums.size(); ++i) ss << del << nums[i]; return ss.str(); } // escapes a string for passing to popen, which uses /bin/sh to parse its argument string static std::string escapeForPOpen(const std::string& arg) { // e.g. abc -> 'abc'; my file.txt -> 'my file.txt'; $10 -> '$10'; it's -> 'it'\''s' return arg; // @BUGBUG: This sometimes fails with "sh: 1: Syntax error: Unterminated quoted string", // so since this is not super-critical, we will disable it for now. //return "'" + findReplace(arg, "'", "'\\''", /*all=*/ true) + "'"; } // execute an external command // The command is composed of three pieces: // - the executable path, e.g. --valid-script-path // - an optional array of arguments. Meant for options. E.g. --valid-script-args. Options with leading - can only be passed via Yaml/Json. // - one more optional single argument. Meant as the main filename argument. // Each item will be escaped for shell syntax. std::string exec(const std::string& cmd, const std::vector& args /*= {}*/, const std::string& arg /*= ""*/) { std::array buffer; std::string result; #ifdef _WIN32 #define popen _popen #define pclose _pclose #endif auto cmdLine = escapeForPOpen(cmd); for (const auto& a : args) // @TODO: proper escaping cmdLine += " " + escapeForPOpen(a); if (!arg.empty()) cmdLine += " " + escapeForPOpen(arg); //std::cerr << "###" << cmdLine << "###" << std::endl; std::shared_ptr pipe(popen(cmdLine.c_str(), "r"), pclose); if(!pipe) ABORT("popen() failed!"); while(!std::feof(pipe.get())) { if(std::fgets(buffer.data(), (int)buffer.size(), pipe.get()) != NULL) result += buffer.data(); } return result; } std::pair hostnameAndProcessId() { // helper to get hostname:pid #ifdef _WIN32 std::string hostname = getenv("COMPUTERNAME"); auto processId = (int)GetCurrentProcessId(); #else static std::string hostname = []() { // not sure if gethostname() is expensive. This way we call it only once. char hostnamebuf[HOST_NAME_MAX + 1] = {0}; gethostname(hostnamebuf, sizeof(hostnamebuf)); return std::string(hostnamebuf); }(); auto processId = (int)getpid(); #endif return {hostname, processId}; } // format a long number with comma separators std::string withCommas(size_t n) { std::string res = std::to_string(n); for(int i = (int)res.size() - 3; i > 0; i -= 3) res.insert(i, ","); return res; } bool beginsWith(const std::string& text, const std::string& prefix) { return text.size() >= prefix.size() && !text.compare(0, prefix.size(), prefix); } bool endsWith(const std::string& text, const std::string& suffix) { return text.size() >= suffix.size() && !text.compare(text.size() - suffix.size(), suffix.size(), suffix); } // @TODO: sort these functions into a separate header. std::u32string utf8ToUnicodeString(std::string const& s) { #ifdef _MSC_VER // workaround for a known bug in VS CRT std::wstring_convert, unsigned int/*char32_t*/> converter; auto res = converter.from_bytes(s); return std::u32string(res.begin(), res.end()); #else std::wstring_convert, char32_t> converter; return converter.from_bytes(s); #endif } std::string utf8FromUnicodeString(const std::u32string& s) { #ifdef _MSC_VER // workaround for a known bug in VS CRT std::wstring_convert, unsigned int/*char32_t*/> converter; std::basic_string si(s.begin(), s.end()); return converter.to_bytes(si); #else std::wstring_convert, char32_t> converter; return converter.to_bytes(s); #endif } std::u16string utf8ToUtf16String(std::string const& s) { #ifdef _MSC_VER // workaround for a known bug in VS CRT std::wstring_convert, wchar_t/*char16_t*/> converter; auto res = converter.from_bytes(s); return std::u16string(res.begin(), res.end()); #else std::wstring_convert, char16_t> converter; return converter.from_bytes(s); #endif } std::string utf8FromUtf16String(const std::u16string& s) { #ifdef _MSC_VER // workaround for a known bug in VS CRT std::wstring_convert, wchar_t/*char16_t*/> converter; std::basic_string si(s.begin(), s.end()); return converter.to_bytes(si); #else std::wstring_convert, char16_t> converter; return converter.to_bytes(s); #endif } // test whether a Unicode code point is in continuous script (e.g. Chinese or Thai) // This is used for detok bleu scoring where we have CJT characters. bool isContinuousScript(char32_t c) { // currently, this table is hand-coded, and may need to be extended when the standard grows auto in = [c](char32_t minVal, char32_t maxVal) { return c >= minVal && c <= maxVal; }; bool isHan = in(0x2E80, 0x2E99) || in(0x2E9B, 0x2EF3) || in(0x2F00, 0x2FD5) || in(0x3005, 0x3005) || in(0x3007, 0x3007) || in(0x3021, 0x3029) || in(0x3038, 0x303A) || in(0x303B, 0x303b) || in(0x3200, 0x32FF) || // Enclosed CJK Letters and Months, https://en.wikipedia.org/wiki/Enclosed_CJK_Letters_and_Months in(0x3400, 0x4DB5) || // up to here, we have a few gaps compared to sacrebleu in(0x4E00, 0x9FEF) || // sacrebleu: only up to 0x9fbb in(0xF900, 0xFA6D) || in(0xFA70, 0xFAD9) || // similar to sacrebleu in(0x20000, 0x2A6D6) || in(0x2A700, 0x2B734) || in(0x2B740, 0x2B81D) || in(0x2B820, 0x2CEA1) || in(0x2CEB0, 0x2EBE0) || // not in sacrebleu in(0x2F800, 0x2FA1D); bool isKana = in(0x3040, 0x30FF) || // Hiragana, Katakana in(0x1B000, 0x1B0FF) || // Kana supplement, https://en.wikipedia.org/wiki/Kana_Supplement in(0x1B130, 0x1B16F); // small Kana, https://en.wikipedia.org/wiki/Small_Kana_Extension bool isThai = in(0x0E00, 0x0E7F); // https://en.wikipedia.org/wiki/Thai_(Unicode_block) return isHan || isKana || isThai; // Korean characters (Hangul syllables): 0xac00..0xd7a3 // Korean subwords (Hangul Jamo): 0x1100..0x11ff [https://en.wikipedia.org/wiki/Hangul_Jamo_(Unicode_block)] // Sacrebleu uses characters units for Chinese characters; specifically, these ranges: /* (ranges as used in sacrebleuy.py) uchar >= u'\u2600' and uchar <= u'\u27bf' ## missing above uchar >= u'\u2e80' # CJK Radicals Supplement uchar >= u'\u2f00' and uchar <= u'\u2fdf' # Kangxi Radicals uchar >= u'\u2ff0' # Chinese character structure uchar >= u'\u3000' and uchar <= u'\u303f' # CJK punctuation mark ## 3040..30ff = Kana uchar >= u'\u3100' and uchar <= u'\u312f' # Phonetic symbols uchar >= u'\u31a0' # Phonetic symbols (Taiwanese and Hakka expansion) uchar >= u'\u31c0' and uchar <= u'\u31ef' # CJK stroke uchar >= u'\u3200' and uchar <= u'\u4db5' # CJK Unified Ideographs Extension A, release 3.0 uchar >= u'\u4e00' # CJK Unified Ideographs, release 1.1 uchar >= u'\u9fa6' and uchar <= u'\u9fbb' # CJK Unified Ideographs, release 4.1 uchar >= u'\uf900' and uchar <= u'\ufa2d' # CJK Compatibility Ideographs, release 1.1 uchar >= u'\ufa30' and uchar <= u'\ufa6a' # CJK Compatibility Ideographs, release 3.2 uchar >= u'\ufa70' and uchar <= u'\ufad9' # CJK Compatibility Ideographs, release 4.1 uchar >= u'\ufe10' and uchar <= u'\ufe1f' ## missing above uchar >= u'\ufe30' and uchar <= u'\ufe4f' ## missing above uchar >= u'\uff00' and uchar <= u'\uffef' # Full width ASCII, full width of English punctuation, half width Katakana, half wide half width kana, Korean alphabet uchar >= u'\u20000' and uchar <= u'\u2a6d6' # CJK Unified Ideographs Extension B, release 3.1 uchar >= u'\u2f800' and uchar <= u'\u2fa1d' # CJK Compatibility Supplement, release 3.1 */ } // convert UTF-8 characters to lower or upper case struct UTF8Mapper { // can't use the standard lib functions because MS-internal Philly servers do not have UTF-8 locale installed std::map toUpperMap, toLowerMap; UTF8Mapper() { /* env LC_ALL=en_US.UTF-8 sed 's/$.$/\1\n/g' TEXT_FILE_CONTAINING_ALL_CHARS > l env LC_ALL=en_US.UTF-8 sed 's/$.$/\U\1\n/g' TEXT_FILE_CONTAINING_ALL_CHARS > u paste l u | env LC_ALL=en_US.UTF-8 sort -u > x cat x | awk '{if($1 != $2){print}}' > y cat y | tr -d '\r' \ | od -w10000 -t x1 \ | head -1 \ | sed -e 's/^0000000 /{{".x/g' -e 's/ 09 /",".x/g' -e 's/ 0a /"},{".x/g' -e 's/ 0a$/"}/' -e 's/ /.x/g' \ | tr '.' '\\' \ | xclip */ std::vector> map8{ {"\xc9\x92","\xe2\xb1\xb0"},{"\x61","\x41"},{"\xc3\xa1","\xc3\x81"},{"\xc3\xa0","\xc3\x80"},{"\xe1\xba\xaf","\xe1\xba\xae"},{"\xe1\xba\xb1","\xe1\xba\xb0"},{"\xe1\xba\xb5","\xe1\xba\xb4"},{"\xe1\xba\xb3","\xe1\xba\xb2"},{"\xe1\xba\xb7","\xe1\xba\xb6"},{"\xc4\x83","\xc4\x82"},{"\xe1\xba\xa5","\xe1\xba\xa4"},{"\xe1\xba\xa7","\xe1\xba\xa6"},{"\xe1\xba\xab","\xe1\xba\xaa"},{"\xe1\xba\xa9","\xe1\xba\xa8"},{"\xe1\xba\xad","\xe1\xba\xac"},{"\xc3\xa2","\xc3\x82"},{"\xc7\x8e","\xc7\x8d"},{"\xc7\xbb","\xc7\xba"},{"\xc3\xa5","\xc3\x85"},{"\xc7\x9f","\xc7\x9e"},{"\xc3\xa4","\xc3\x84"},{"\xc3\xa3","\xc3\x83"},{"\xc4\x85","\xc4\x84"},{"\xc4\x81","\xc4\x80"},{"\xe1\xba\xa3","\xe1\xba\xa2"},{"\xc8\x83","\xc8\x82"},{"\xe1\xba\xa1","\xe1\xba\xa0"},{"\xc7\xa3","\xc7\xa2"},{"\xc3\xa6","\xc3\x86"},{"\x62","\x42"},{"\xe1\xb8\x87","\xe1\xb8\x86"},{"\x63","\x43"},{"\xc4\x87","\xc4\x86"},{"\xc4\x89","\xc4\x88"},{"\xc4\x8d","\xc4\x8c"},{"\xc4\x8b","\xc4\x8a"},{"\xc3\xa7","\xc3\x87"},{"\x64","\x44"},{"\xc4\x8f","\xc4\x8e"},{"\xc4\x91","\xc4\x90"},{"\xe1\xb8\x91","\xe1\xb8\x90"},{"\xe1\xb8\x8d","\xe1\xb8\x8c"},{"\xe1\xb8\x8f","\xe1\xb8\x8e"},{"\xc3\xb0","\xc3\x90"},{"\x65","\x45"},{"\xc3\xa9","\xc3\x89"},{"\xc3\xa8","\xc3\x88"},{"\xc4\x95","\xc4\x94"},{"\xe1\xba\xbf","\xe1\xba\xbe"},{"\xe1\xbb\x81","\xe1\xbb\x80"},{"\xe1\xbb\x85","\xe1\xbb\x84"},{"\xe1\xbb\x83","\xe1\xbb\x82"},{"\xe1\xbb\x87","\xe1\xbb\x86"},{"\xc3\xaa","\xc3\x8a"},{"\xc4\x9b","\xc4\x9a"},{"\xc3\xab","\xc3\x8b"},{"\xe1\xba\xbd","\xe1\xba\xbc"},{"\xc4\x97","\xc4\x96"},{"\xc4\x99","\xc4\x98"},{"\xe1\xb8\x97","\xe1\xb8\x96"},{"\xc4\x93","\xc4\x92"},{"\xe1\xba\xbb","\xe1\xba\xba"},{"\xc8\x87","\xc8\x86"},{"\xe1\xba\xb9","\xe1\xba\xb8"},{"\xc7\x9d","\xc6\x8e"},{"\x66","\x46"},{"\x67","\x47"},{"\xc7\xb5","\xc7\xb4"},{"\xc4\x9f","\xc4\x9e"},{"\xc4\x9d","\xc4\x9c"},{"\xc7\xa7","\xc7\xa6"},{"\xc4\xa1","\xc4\xa0"},{"\xc4\xa3","\xc4\xa2"},{"\xc9\xa0","\xc6\x93"},{"\x68","\x48"},{"\xc4\xa5","\xc4\xa4"},{"\xc4\xa7","\xc4\xa6"},{"\xe1\xb8\xa9","\xe1\xb8\xa8"},{"\xe1\xb8\xa5","\xe1\xb8\xa4"},{"\xe1\xb8\xab","\xe1\xb8\xaa"},{"\x69","\x49"},{"\xc4\xb1","\x49"},{"\xc3\xad","\xc3\x8d"},{"\xc3\xac","\xc3\x8c"},{"\xc4\xad","\xc4\xac"},{"\xc3\xae","\xc3\x8e"},{"\xc7\x90","\xc7\x8f"},{"\xc3\xaf","\xc3\x8f"},{"\xc4\xa9","\xc4\xa8"},{"\xc4\xaf","\xc4\xae"},{"\xc4\xab","\xc4\xaa"},{"\xe1\xbb\x89","\xe1\xbb\x88"},{"\xc8\x8b","\xc8\x8a"},{"\xe1\xbb\x8b","\xe1\xbb\x8a"},{"\x6a","\x4a"},{"\xc4\xb5","\xc4\xb4"},{"\x6b","\x4b"},{"\xe1\xb8\xb1","\xe1\xb8\xb0"},{"\xc4\xb7","\xc4\xb6"},{"\xe1\xb8\xb3","\xe1\xb8\xb2"},{"\xc6\x99","\xc6\x98"},{"\x6c","\x4c"},{"\xc4\xba","\xc4\xb9"},{"\xc4\xbe","\xc4\xbd"},{"\xc5\x82","\xc5\x81"},{"\xc4\xbc","\xc4\xbb"},{"\xe1\xb8\xb7","\xe1\xb8\xb6"},{"\x6d","\x4d"},{"\xe1\xb8\xbf","\xe1\xb8\xbe"},{"\xe1\xb9\x83","\xe1\xb9\x82"},{"\xc5\x8b","\xc5\x8a"},{"\x6e","\x4e"},{"\xc5\x84","\xc5\x83"},{"\xc5\x88","\xc5\x87"},{"\xc3\xb1","\xc3\x91"},{"\xe1\xb9\x85","\xe1\xb9\x84"},{"\xc5\x86","\xc5\x85"},{"\xe1\xb9\x87","\xe1\xb9\x86"},{"\xe1\xb9\x89","\xe1\xb9\x88"},{"\xc5\x93","\xc5\x92"},{"\x6f","\x4f"},{"\xc3\xb3","\xc3\x93"},{"\xc3\xb2","\xc3\x92"},{"\xc5\x8f","\xc5\x8e"},{"\xe1\xbb\x91","\xe1\xbb\x90"},{"\xe1\xbb\x93","\xe1\xbb\x92"},{"\xe1\xbb\x95","\xe1\xbb\x94"},{"\xe1\xbb\x99","\xe1\xbb\x98"},{"\xc3\xb4","\xc3\x94"},{"\xc7\x92","\xc7\x91"},{"\xc3\xb6","\xc3\x96"},{"\xc5\x91","\xc5\x90"},{"\xc3\xb5","\xc3\x95"},{"\xc3\xb8","\xc3\x98"},{"\xc7\xab","\xc7\xaa"},{"\xc5\x8d","\xc5\x8c"},{"\xe1\xbb\x8f","\xe1\xbb\x8e"},{"\xc8\x8f","\xc8\x8e"},{"\xe1\xbb\x8d","\xe1\xbb\x8c"},{"\xe1\xbb\x9b","\xe1\xbb\x9a"},{"\xe1\xbb\x9d","\xe1\xbb\x9c"},{"\xe1\xbb\xa1","\xe1\xbb\xa0"},{"\xe1\xbb\x9f","\xe1\xbb\x9e"},{"\xe1\xbb\xa3","\xe1\xbb\xa2"},{"\xc6\xa1","\xc6\xa0"},{"\xc9\x94","\xc6\x86"},{"\x70","\x50"},{"\xe1\xb9\x95","\xe1\xb9\x94"},{"\x71","\x51"},{"\x72","\x52"},{"\xc5\x95","\xc5\x94"},{"\xc5\x99","\xc5\x98"},{"\xc5\x97","\xc5\x96"},{"\xe1\xb9\x9b","\xe1\xb9\x9a"},{"\xe1\xb9\x9f","\xe1\xb9\x9e"},{"\x73","\x53"},{"\xc5\x9b","\xc5\x9a"},{"\xc5\x9d","\xc5\x9c"},{"\xc5\xa1","\xc5\xa0"},{"\xc5\x9f","\xc5\x9e"},{"\xe1\xb9\xa3","\xe1\xb9\xa2"},{"\x74","\x54"},{"\xc5\xa5","\xc5\xa4"},{"\xc5\xa3","\xc5\xa2"},{"\xe1\xb9\xad","\xe1\xb9\xac"},{"\xe1\xb9\xaf","\xe1\xb9\xae"},{"\xc8\x95","\xc8\x94"},{"\x75","\x55"},{"\xc3\xba","\xc3\x9a"},{"\xc3\xb9","\xc3\x99"},{"\xc5\xad","\xc5\xac"},{"\xc3\xbb","\xc3\x9b"},{"\xc7\x94","\xc7\x93"},{"\xc5\xaf","\xc5\xae"},{"\xc7\x98","\xc7\x97"},{"\xc7\x9c","\xc7\x9b"},{"\xc3\xbc","\xc3\x9c"},{"\xc5\xb1","\xc5\xb0"},{"\xc5\xa9","\xc5\xa8"},{"\xc5\xb3","\xc5\xb2"},{"\xc5\xab","\xc5\xaa"},{"\xe1\xbb\xa7","\xe1\xbb\xa6"},{"\xe1\xbb\xa5","\xe1\xbb\xa4"},{"\xe1\xb9\xb3","\xe1\xb9\xb2"},{"\xe1\xbb\xa9","\xe1\xbb\xa8"},{"\xe1\xbb\xab","\xe1\xbb\xaa"},{"\xe1\xbb\xaf","\xe1\xbb\xae"},{"\xe1\xbb\xad","\xe1\xbb\xac"},{"\xe1\xbb\xb1","\xe1\xbb\xb0"},{"\xc6\xb0","\xc6\xaf"},{"\x76","\x56"},{"\x77","\x57"},{"\xc5\xb5","\xc5\xb4"},{"\x78","\x58"},{"\xe1\xba\x8b","\xe1\xba\x8a"},{"\x79","\x59"},{"\xc3\xbd","\xc3\x9d"},{"\xe1\xbb\xb3","\xe1\xbb\xb2"},{"\xc5\xb7","\xc5\xb6"},{"\xc3\xbf","\xc5\xb8"},{"\xe1\xbb\xb9","\xe1\xbb\xb8"},{"\x7a","\x5a"},{"\xc5\xba","\xc5\xb9"},{"\xc5\xbe","\xc5\xbd"},{"\xc5\xbc","\xc5\xbb"},{"\xc6\xb6","\xc6\xb5"},{"\xe1\xba\x93","\xe1\xba\x92"},{"\xe1\xba\x95","\xe1\xba\x94"},{"\xc8\xa5","\xc8\xa4"},{"\xc3\xbe","\xc3\x9e"},{"\xca\x92","\xc6\xb7"},{"\xce\xb1","\xce\x91"},{"\xce\xac","\xce\x86"},{"\xce\xb2","\xce\x92"},{"\xce\xb3","\xce\x93"},{"\xce\xb4","\xce\x94"},{"\xce\xb5","\xce\x95"},{"\xce\xad","\xce\x88"},{"\xce\xb6","\xce\x96"},{"\xce\xb7","\xce\x97"},{"\xce\xae","\xce\x89"},{"\xce\xb8","\xce\x98"},{"\xce\xb9","\xce\x99"},{"\xce\xaf","\xce\x8a"},{"\xcf\x8a","\xce\xaa"},{"\xce\xba","\xce\x9a"},{"\xce\xbb","\xce\x9b"},{"\xce\xbc","\xce\x9c"},{"\xce\xbd","\xce\x9d"},{"\xce\xbe","\xce\x9e"},{"\xce\xbf","\xce\x9f"},{"\xcf\x8c","\xce\x8c"},{"\xcf\x80","\xce\xa0"},{"\xcf\x83","\xce\xa3"},{"\xcf\x82","\xce\xa3"},{"\xcf\x84","\xce\xa4"},{"\xcf\x85","\xce\xa5"},{"\xcf\x8d","\xce\x8e"},{"\xcf\x8b","\xce\xab"},{"\xcf\x86","\xce\xa6"},{"\xcf\x87","\xce\xa7"},{"\xcf\x88","\xce\xa8"},{"\xcf\x89","\xce\xa9"},{"\xcf\x8e","\xce\x8f"},{"\xd0\xb0","\xd0\x90"},{"\xd3\x93","\xd3\x92"},{"\xd3\x95","\xd3\x94"},{"\xd0\xb1","\xd0\x91"},{"\xd0\xb2","\xd0\x92"},{"\xd0\xb3","\xd0\x93"},{"\xd2\x93","\xd2\x92"},{"\xd2\x91","\xd2\x90"},{"\xd0\xb4","\xd0\x94"},{"\xd1\x93","\xd0\x83"},{"\xd1\x92","\xd0\x82"},{"\xd0\xb5","\xd0\x95"},{"\xd1\x90","\xd0\x80"},{"\xd3\x99","\xd3\x98"},{"\xd1\x94","\xd0\x84"},{"\xd1\x91","\xd0\x81"},{"\xd0\xb6","\xd0\x96"},{"\xd0\xb7","\xd0\x97"},{"\xd2\x99","\xd2\x98"},{"\xd1\x95","\xd0\x85"},{"\xd0\xb8","\xd0\x98"},{"\xd3\xa3","\xd3\xa2"},{"\xd1\x96","\xd0\x86"},{"\xd1\x97","\xd0\x87"},{"\xd0\xb9","\xd0\x99"},{"\xd1\x98","\xd0\x88"},{"\xd0\xba","\xd0\x9a"},{"\xd2\x9b","\xd2\x9a"},{"\xd3\x84","\xd3\x83"},{"\xd2\xa1","\xd2\xa0"},{"\xd0\xbb","\xd0\x9b"},{"\xd1\x99","\xd0\x89"},{"\xd0\xbc","\xd0\x9c"},{"\xd0\xbd","\xd0\x9d"},{"\xd2\xa3","\xd2\xa2"},{"\xd1\x9a","\xd0\x8a"},{"\xd0\xbe","\xd0\x9e"},{"\xd3\xa7","\xd3\xa6"},{"\xd3\xa9","\xd3\xa8"},{"\xd0\xbf","\xd0\x9f"},{"\xd1\x80","\xd0\xa0"},{"\xd1\x81","\xd0\xa1"},{"\xd2\xab","\xd2\xaa"},{"\xd1\x82","\xd0\xa2"},{"\xd1\x9c","\xd0\x8c"},{"\xd1\x9b","\xd0\x8b"},{"\xd1\x83","\xd0\xa3"},{"\xd3\xb1","\xd3\xb0"},{"\xd2\xb1","\xd2\xb0"},{"\xd2\xaf","\xd2\xae"},{"\xd1\x9e","\xd0\x8e"},{"\xd1\x84","\xd0\xa4"},{"\xd1\x85","\xd0\xa5"},{"\xd2\xb3","\xd2\xb2"},{"\xd2\xbb","\xd2\xba"},{"\xd1\x86","\xd0\xa6"},{"\xd1\x87","\xd0\xa7"},{"\xd1\x9f","\xd0\x8f"},{"\xd1\x88","\xd0\xa8"},{"\xd1\x89","\xd0\xa9"},{"\xd1\x8a","\xd0\xaa"},{"\xd1\x8b","\xd0\xab"},{"\xd1\x8c","\xd0\xac"},{"\xd1\x8d","\xd0\xad"},{"\xd1\x8e","\xd0\xae"},{"\xd1\x8f","\xd0\xaf"},{"\xd5\xa1","\xd4\xb1"},{"\xd5\xa3","\xd4\xb3"},{"\xd5\xa5","\xd4\xb5"},{"\xd5\xab","\xd4\xbb"},{"\xd5\xac","\xd4\xbc"},{"\xd5\xb2","\xd5\x82"},{"\xd5\xb8","\xd5\x88"},{"\xd5\xbd","\xd5\x8d"},{"\xd5\xbe","\xd5\x8e"},{"\xd5\xbf","\xd5\x8f"},{"\xd6\x80","\xd5\x90"},{"\xd6\x81","\xd5\x91"} }; for (auto p8 : map8) { auto from = utf8ToUnicodeString(p8.first); auto to = utf8ToUnicodeString(p8.second); ABORT_IF(from.size() != 1 || to.size() != 1, "Incorrect character encoding??"); toUpperMap.insert(std::make_pair(from.front(), to.front())); toLowerMap.insert(std::make_pair(to.front(), from.front())); } } char32_t toUpperOrLower(char32_t c, bool toLower) const { return mapChar(toLower ? toLowerMap : toUpperMap, c); } private: static char32_t mapChar(const std::map& map, char32_t c) { auto iter = map.find(c); if (iter == map.end()) return c; else return iter->second; } }; // shared implementation of toUpper, toLower, and toCapitalized static std::string utf8ToUpperOrLower(const std::string& s, bool toLower, bool toInitCap) { static UTF8Mapper utf8Mapper; auto ws = utf8ToUnicodeString(s); for (auto& c : ws) { c = utf8Mapper.toUpperOrLower(c, toLower); if (toInitCap) toLower = true; } return utf8FromUnicodeString(ws); } std::string utf8ToUpper(const std::string& s) { return utf8ToUpperOrLower(s, /*toLower=*/false, /*toInitCap=*/false); } std::string utf8ToLower(const std::string& s) { return utf8ToUpperOrLower(s, /*toLower=*/true , /*toInitCap=*/false); } std::string utf8Capitalized(const std::string& s) { return utf8ToUpperOrLower(s, /*toLower=*/false, /*toInitCap=*/true ); } // convert an English sentence to title case // Since title case is an English thing, we only consider ASCII characters. std::string toEnglishTitleCase(const std::string& s) { auto res = s; // process token by token const std::string wordStartChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; const std::string wordInternalChars = wordStartChars + "'"; // don't title-case letters after word-internal apostrophe const std::set exceptions = { // from moses-scripts/scripts/recaser/detruecase.perl "a","after","against","al-.+","and","any","as","at","be","because","between","by","during","el-.+","for","from","his","in","is","its","last","not","of","off","on","than","the","their","this","to","was","were","which","will","with" }; const std::set wordPredChars = {' ', '"', '\'', '-'}; // only capitalize words if following these characters (to avoid upper-casing word-internal SPM units) // These are tokenization heuristics, which may be incomplete. size_t epos = 0; for(size_t pos = epos; pos < res.size(); pos = epos) { // locate the next word pos = res.find_first_of(wordStartChars, pos); // find first letter if (pos == std::string::npos) break; epos = res.find_first_not_of(wordInternalChars, pos + 1); // find first non-letter if (epos == std::string::npos) epos = res.size(); auto word = res.substr(pos, epos - pos); // further checks of the word if (res[pos] < 'a' || res[pos] > 'z') // skip if already upper-case continue; if (pos > 0 && wordPredChars.find(res[pos-1]) == wordPredChars.end()) // skip if unexpected char before the word continue; if (exceptions.find(word) != exceptions.end()) // skip if in the exception list continue; // upper-case it res[pos] -= 'a' - 'A'; } return res; } std::string findReplace(const std::string& in, const std::string& what, const std::string& withWhat, bool all /*= false*/) { std::string res = in; for(size_t pos = res.find(what); pos != std::string::npos; pos = res.find(what, pos + withWhat.length())) { res.replace(pos, what.length(), withWhat); if (!all) break; } return res; } double parseDouble(std::string s) { double res; char c; // dummy char -- if we succeed to parse this, then there were extraneous characters after the number auto rc = sscanf(s.c_str(), "%lf%c", &res, &c); ABORT_IF(rc != 1, "Mal-formed number: {}", s); return res; } // parses a user-friendly number that can have commas and (some) units double parseNumber(std::string param) { // get unit prefix double factor = 1.; if(!param.empty() && param.back() >= 'A') { switch(param.back()) { case 'k': factor = 1.e3; break; case 'K': factor = 1.e3; break; // not technically correct but often used for k case 'M': factor = 1.e6; break; case 'G': factor = 1.e9; break; case 'T': factor = 1.e12; break; default: ABORT("Invalid or unsupported unit prefix '{}' in {}", param.back(), param); } param.pop_back(); } // we allow users to place commas in numbers (note: we are not actually verifying that they are in // the right place) auto it = std::remove_if(param.begin(), param.end(), [](char c) { return c == ','; }); // use return value for future-proofing against nodiscard warning param.erase(it, param.end()); // since we have that iterator now, we might as well shrink to fit return factor * parseDouble(param); } } // namespace utils } // namespace marian