libs/core/src/core/utf8.h - GCC Code Coverage Report

libs/core/src/core/utf8.h

Line	Exec	Source
1		#pragma once
2
3		#include <cstddef>
4
5		#include "assert/assert.h"
6
7
namespace eu::core
{
    /// Decodes a UTF-8 encoded string and reports every decoded code point.
    ///
    /// The bytes of `string` are consumed in order. Each time a complete
    /// sequence (1 to 4 bytes) has been read, `on_codepoint` is invoked with
    /// the decoded value as an `int`.
    ///
    /// @param string       Any range that yields `char`-convertible elements
    ///                     when iterated with a range-for.
    /// @param on_codepoint Callable invoked as `on_codepoint(int)` once per
    ///                     decoded code point, in input order.
    /// @return `true` when the whole input was consumed and ended on a
    ///         sequence boundary. `false` when a stray continuation byte, an
    ///         invalid lead byte (0xF8..0xFF), a missing continuation byte or
    ///         a truncated trailing sequence is encountered.
    ///
    /// @note Decoding stops at the first malformed byte; code points that were
    ///       decoded before it have already been passed to `on_codepoint`.
    /// @note Only the bit layout is checked. Overlong encodings, surrogate
    ///       values and values above U+10FFFF are decoded and reported as-is.
    ///
    /// reference: https://en.wikipedia.org/wiki/UTF-8
    template<typename TString, typename TOnCodepointFunc>
    bool calc_utf8_to_codepoints(const TString& string, TOnCodepointFunc on_codepoint)
    {
        using CodePointInt = int;

        const auto to_byte = [](char c) -> std::byte { return static_cast<std::byte>(c); };
        const auto to_int = [](std::byte b) -> CodePointInt { return std::to_integer<CodePointInt>(b); };

        // maskN selects the marker bits of a byte, bitN is the expected marker:
        //   0 = single byte (ascii), 1 = continuation byte, 2..4 = lead byte of a N byte sequence
        constexpr auto mask0 = std::byte{0b10000000}; constexpr auto bit0 = std::byte{0b00000000};
        constexpr auto mask1 = std::byte{0b11000000}; constexpr auto bit1 = std::byte{0b10000000};
        constexpr auto mask2 = std::byte{0b11100000}; constexpr auto bit2 = std::byte{0b11000000};
        constexpr auto mask3 = std::byte{0b11110000}; constexpr auto bit3 = std::byte{0b11100000};
        constexpr auto mask4 = std::byte{0b11111000}; constexpr auto bit4 = std::byte{0b11110000};

        // number of continuation bytes still expected for the current sequence
        int remaining = 0;
        // payload bits collected so far for the current sequence
        CodePointInt codepoint = 0;

        for(const auto c: string)
        {
            const auto b = to_byte(c);

            if(remaining == 0)
            {
                // start of a new sequence
                if((mask0 & b) == bit0)
                {
                    on_codepoint(to_int(b));
                }
                else if((mask4 & b) == bit4)
                {
                    codepoint = to_int(b & ~mask4);
                    remaining = 3;
                }
                else if((mask3 & b) == bit3)
                {
                    codepoint = to_int(b & ~mask3);
                    remaining = 2;
                }
                else if((mask2 & b) == bit2)
                {
                    codepoint = to_int(b & ~mask2);
                    remaining = 1;
                }
                else
                {
                    // either a stray continuation byte (10xxxxxx) or a byte
                    // that can never start a sequence (11111xxx): this is bad
                    // input, not a programming error, so report it to the caller
                    return false;
                }
            }
            else
            {
                if((mask1 & b) != bit1)
                {
                    // invalid continuation bit
                    return false;
                }

                // every continuation byte contributes 6 payload bits
                codepoint = (codepoint << 6) | to_int(b & ~mask1);
                remaining -= 1;

                if(remaining == 0)
                {
                    on_codepoint(codepoint);
                }
            }
        }

        // if remaining != 0 the input ended in the middle of a codepoint
        return remaining == 0;
    }
}
113