libs/core/src/core/utf8.h
| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #pragma once | ||
| 2 | |||
| 3 | #include <cstddef> | ||
| 4 | |||
| 5 | #include "assert/assert.h" | ||
| 6 | |||
| 7 | |||
namespace eu::core
{
    /** Decodes a UTF-8 encoded string and reports each code point, in order.
     *
     * The decoder is strict: besides structural errors (stray or missing
     * continuation bytes, truncated sequences, bytes that can never occur in
     * UTF-8) it also rejects overlong encodings, UTF-16 surrogates
     * (U+D800..U+DFFF) and values above U+10FFFF.
     *
     * @param string        any range of char-like elements holding UTF-8 bytes
     * @param on_codepoint  callable invoked as `on_codepoint(int)` once per decoded code point
     * @return `true` when the whole input was well-formed UTF-8, `false` at the first
     *         malformed sequence. Code points decoded before the error have already
     *         been passed to `on_codepoint` when `false` is returned.
     */
    template<typename TString, typename TOnCodepointFunc>
    bool calc_utf8_to_codepoints(const TString& string, TOnCodepointFunc on_codepoint)
    {
        // reference: https://en.wikipedia.org/wiki/UTF-8
        using CodePointInt = int;

        // smallest code point that really needs a sequence of N bytes (index = N),
        // anything below that is an overlong encoding
        constexpr CodePointInt smallest_for_length[] = {0, 0, 0x80, 0x800, 0x10000};
        constexpr CodePointInt first_surrogate = 0xD800;
        constexpr CodePointInt last_surrogate = 0xDFFF;
        constexpr CodePointInt last_codepoint = 0x10FFFF;

        unsigned int length = 0;     // total byte count of the sequence being decoded
        unsigned int remaining = 0;  // continuation bytes still expected, 0 = between code points
        CodePointInt codepoint = 0;  // payload bits collected so far

        for(auto c: string)
        {
            // go through unsigned char so a negative char never sign-extends
            const auto b = static_cast<unsigned int>(static_cast<unsigned char>(c));

            if(remaining == 0)
            {
                if(b < 0x80u)
                {
                    // 0xxxxxxx: plain ascii
                    on_codepoint(static_cast<CodePointInt>(b));
                    continue;
                }

                if((b & 0xE0u) == 0xC0u)
                {
                    // 110xxxxx
                    length = 2;
                    codepoint = static_cast<CodePointInt>(b & 0x1Fu);
                }
                else if((b & 0xF0u) == 0xE0u)
                {
                    // 1110xxxx
                    length = 3;
                    codepoint = static_cast<CodePointInt>(b & 0x0Fu);
                }
                else if((b & 0xF8u) == 0xF0u)
                {
                    // 11110xxx
                    length = 4;
                    codepoint = static_cast<CodePointInt>(b & 0x07u);
                }
                else
                {
                    // either a stray continuation byte (10xxxxxx) or a byte that is
                    // never valid in utf-8 (11111xxx): bad input, not a programming error
                    return false;
                }

                remaining = length - 1;
            }
            else
            {
                if((b & 0xC0u) != 0x80u)
                {
                    // invalid continuation bit
                    return false;
                }

                codepoint = (codepoint << 6) | static_cast<CodePointInt>(b & 0x3Fu);
                remaining -= 1;

                if(remaining == 0)
                {
                    if(codepoint < smallest_for_length[length])
                    {
                        // overlong encoding
                        return false;
                    }
                    if(codepoint >= first_surrogate && codepoint <= last_surrogate)
                    {
                        // utf-16 surrogates are not valid scalar values
                        return false;
                    }
                    if(codepoint > last_codepoint)
                    {
                        // outside of the unicode range
                        return false;
                    }

                    on_codepoint(codepoint);
                }
            }
        }

        // if remaining != 0 the input ended in the middle of a code point
        return remaining == 0;
    }
}
| 113 |