urlyy commented on issue #1547:
URL: https://github.com/apache/fury/issues/1547#issuecomment-2219824706
I just wrote a demo for this in C++; it only handles big-endian UTF-16.
I'll create a Rust version soon.
```cpp
#include <codecvt>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <locale>
#include <stdexcept>
#include <string>
#include <vector>
// Writes every bit of a 16-bit value to stdout, most significant bit first,
// followed by a newline.
void print(uint16_t num)
{
    constexpr int kBitCount = sizeof(num) * 8;
    // Slide a single-bit mask from the top bit down to bit 0.
    uint16_t mask = static_cast<uint16_t>(1u << (kBitCount - 1));
    while (mask != 0)
    {
        printf("%d", (num & mask) ? 1 : 0);
        mask = static_cast<uint16_t>(mask >> 1);
    }
    printf("\n");
}
// Writes every bit of an 8-bit value to stdout, most significant bit first,
// followed by a newline.
void print8(uint8_t num)
{
    // Start one past the top bit; the post-decrement in the condition moves
    // to the bit that is printed in the body.
    int bit = static_cast<int>(sizeof(num) * 8);
    while (bit-- > 0)
    {
        const int value = (num >> bit) & 1;
        printf("%d", value);
    }
    printf("\n");
}
/// Converts a sequence of UTF-16 code units into UTF-8 bytes.
///
/// @param utf16  UTF-16 code units held as native uint16_t values.
/// @return       The UTF-8 encoding of the same text.
/// @throws std::runtime_error  if the input is not well-formed UTF-16:
///         a high surrogate at the end of input, a high surrogate that is
///         not followed by a low surrogate, or a low surrogate with no
///         preceding high surrogate.
///
/// As a demo aid, each encoded character also writes a trace number to
/// std::cout naming the branch taken: 1 = one byte, 2 = two bytes,
/// 3 = surrogate pair (four bytes), 4 = three bytes.
std::vector<uint8_t> utf16_to_utf8(const std::vector<uint16_t> &utf16)
{
    // Builds a continuation byte (10xxxxxx) from the low six bits of `bits`.
    const auto continuation = [](uint32_t bits) {
        return static_cast<uint8_t>((bits & 0x3F) | 0x80);
    };
    const auto is_high_surrogate = [](uint16_t u) {
        return u >= 0xD800 && u <= 0xDBFF;
    };
    const auto is_low_surrogate = [](uint16_t u) {
        return u >= 0xDC00 && u <= 0xDFFF;
    };

    std::vector<uint8_t> utf8;
    // The output is never shorter than the input (ASCII is 1 unit -> 1 byte).
    utf8.reserve(utf16.size());

    for (size_t i = 0; i < utf16.size(); ++i)
    {
        const uint16_t wc = utf16[i];
        if (wc < 0x80)
        {
            // 1-byte UTF-8: 0xxxxxxx
            utf8.push_back(static_cast<uint8_t>(wc));
            std::cout << 1 << "\n";
        }
        else if (wc < 0x800)
        {
            // 2-byte UTF-8: 110xxxxx 10xxxxxx (11 payload bits)
            utf8.push_back(static_cast<uint8_t>((wc >> 6) | 0xC0));
            utf8.push_back(continuation(wc));
            std::cout << 2 << "\n";
        }
        else if (is_high_surrogate(wc))
        {
            // Surrogate pair -> 4-byte UTF-8. The following unit must exist
            // and must be a low surrogate; otherwise the subtraction below
            // would produce a meaningless code point.
            if (i + 1 >= utf16.size() || !is_low_surrogate(utf16[i + 1]))
            {
                throw std::runtime_error("Invalid UTF-16 string");
            }
            const uint16_t wc2 = utf16[++i];
            const uint32_t code_point =
                0x10000u + ((static_cast<uint32_t>(wc - 0xD800) << 10) |
                            static_cast<uint32_t>(wc2 - 0xDC00));
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (21 payload bits)
            utf8.push_back(static_cast<uint8_t>((code_point >> 18) | 0xF0));
            utf8.push_back(continuation(code_point >> 12));
            utf8.push_back(continuation(code_point >> 6));
            utf8.push_back(continuation(code_point));
            std::cout << 3 << "\n";
        }
        else if (is_low_surrogate(wc))
        {
            // A low surrogate that does not follow a high surrogate is not
            // a character; encoding it would emit invalid UTF-8.
            throw std::runtime_error("Invalid UTF-16 string");
        }
        else
        {
            // 3-byte UTF-8: 1110xxxx 10xxxxxx 10xxxxxx (all 16 bits of wc)
            utf8.push_back(static_cast<uint8_t>((wc >> 12) | 0xE0));
            utf8.push_back(continuation(wc >> 6));
            utf8.push_back(continuation(wc));
            std::cout << 4 << "\n";
        }
    }
    return utf8;
}
// Demo driver: dumps the UTF-16 code units of a sample string, converts them
// with utf16_to_utf8, dumps the resulting bytes, and prints them as a string.
int main()
{
    // A char16_t literal yields the UTF-16 code units directly, so the
    // std::wstring_convert / std::codecvt_utf8_utf16 facilities (deprecated
    // since C++17) are not needed.
    const std::u16string utf16_s = u"Hé€lo, 世界!😀";

    std::vector<uint16_t> utf16;
    utf16.reserve(utf16_s.size());
    std::cout << "=====init utf16:" << std::endl;
    for (uint16_t c : utf16_s)
    {
        printf("0x%04x,", c);
        utf16.push_back(c);
    }
    std::cout << "\n";

    const std::vector<uint8_t> utf8 = utf16_to_utf8(utf16);
    std::cout << "=====utf8:" << std::endl;
    for (uint8_t byte : utf8)
    {
        printf("0x%02x,", byte);
    }
    std::cout << std::endl;

    // final UTF-8 string; end with a newline so the output is a complete line
    std::cout << "final string: " << std::string(utf8.begin(), utf8.end()) << '\n';
    return 0;
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]