GCC Code Coverage Report


./
Coverage:
low: ≥ 0%
medium: ≥ 75.0%
high: ≥ 90.0%
Lines:
0 of 108, 0 excluded
0.0%
Functions:
0 of 4, 0 excluded
0.0%
Branches:
0 of 76, 0 excluded
0.0%

libs/core/src/core/utf8.h
Line Branch Exec Source
1 #pragma once
2
3 #include <cstddef>
4
5 #include "assert/assert.h"
6
7
namespace eu::core
{
    /** Decodes a UTF-8 encoded string and reports each code point to a callback.
     *
     * The string is walked once, front to back. `on_codepoint` is invoked with an
     * `int` for every completely decoded code point, in order. Decoding stops at the
     * first malformed sequence; code points decoded before that point have already
     * been reported to the callback.
     *
     * The following input is treated as malformed:
     *  - a continuation byte (10xxxxxx) where a new sequence should start
     *  - a lead byte of the form 11111xxx (0xF8-0xFF), which UTF-8 never uses
     *  - a non-continuation byte inside a multi-byte sequence
     *  - an overlong encoding (a value stored in more bytes than it needs)
     *  - a UTF-16 surrogate (U+D800-U+DFFF) or a value above U+10FFFF
     *  - a multi-byte sequence that is cut off by the end of the string
     *
     * @param string any range that can be iterated with a range-for and whose
     *               elements convert to `char`
     * @param on_codepoint callable invoked as `on_codepoint(int)`
     * @return true if the whole string was well-formed UTF-8, false otherwise
     */
    template<typename TString, typename TOnCodepointFunc>
    bool calc_utf8_to_codepoints(const TString& string, TOnCodepointFunc on_codepoint)
    {
        // reference: https://en.wikipedia.org/wiki/UTF-8
        using CodePointInt = int;
        const auto to_byte = [](char c) -> std::byte { return static_cast<std::byte>(c); };

        // maskN selects the prefix bits of a byte and bitN is the prefix value to match:
        // 0xxxxxxx (single byte), 10xxxxxx (continuation),
        // 110xxxxx / 1110xxxx / 11110xxx (lead byte of a 2 / 3 / 4 byte sequence)
        constexpr auto mask0 = std::byte{0b10000000}; constexpr auto bit0 = std::byte{0b00000000};
        constexpr auto mask1 = std::byte{0b11000000}; constexpr auto bit1 = std::byte{0b10000000};
        constexpr auto mask2 = std::byte{0b11100000}; constexpr auto bit2 = std::byte{0b11000000};
        constexpr auto mask3 = std::byte{0b11110000}; constexpr auto bit3 = std::byte{0b11100000};
        constexpr auto mask4 = std::byte{0b11111000}; constexpr auto bit4 = std::byte{0b11110000};

        constexpr CodePointInt first_surrogate = 0xD800;
        constexpr CodePointInt last_surrogate = 0xDFFF;
        constexpr CodePointInt last_codepoint = 0x10FFFF;

        // number of continuation bytes still missing, 0 when between sequences
        unsigned int remaining = 0;
        // payload bits collected so far for the sequence being decoded
        CodePointInt codepoint = 0;
        // smallest value that needs a sequence of the current length,
        // anything below it is an overlong encoding
        CodePointInt smallest_valid = 0;

        for(const auto c: string)
        {
            const auto b = to_byte(c);

            if(remaining == 0)
            {
                if((mask0 & b) == bit0)
                {
                    on_codepoint(std::to_integer<CodePointInt>(b));
                }
                else if((mask2 & b) == bit2)
                {
                    remaining = 1;
                    smallest_valid = 0x80;
                    codepoint = std::to_integer<CodePointInt>(b & ~mask2);
                }
                else if((mask3 & b) == bit3)
                {
                    remaining = 2;
                    smallest_valid = 0x800;
                    codepoint = std::to_integer<CodePointInt>(b & ~mask3);
                }
                else if((mask4 & b) == bit4)
                {
                    remaining = 3;
                    smallest_valid = 0x10000;
                    codepoint = std::to_integer<CodePointInt>(b & ~mask4);
                }
                else
                {
                    // either a stray continuation byte (10xxxxxx) or an invalid
                    // lead byte (11111xxx): both come from bad input, not from a
                    // bug in this function, so report failure instead of asserting
                    return false;
                }
                continue;
            }

            if((mask1 & b) != bit1)
            {
                // invalid continuation bit
                return false;
            }

            // every continuation byte contributes its low 6 bits
            codepoint = (codepoint << 6) | std::to_integer<CodePointInt>(b & ~mask1);
            remaining -= 1;
            if(remaining != 0)
            {
                continue;
            }

            const bool is_overlong = codepoint < smallest_valid;
            const bool is_surrogate = codepoint >= first_surrogate && codepoint <= last_surrogate;
            const bool is_out_of_range = codepoint > last_codepoint;
            if(is_overlong || is_surrogate || is_out_of_range)
            {
                return false;
            }

            on_codepoint(codepoint);
        }

        // if remaining != 0 the string ended in the middle of a code point
        return remaining == 0;
    }
}
113