Re: [I] [Rust] Support convert utf16 encoded string to utf8 string [fury]

via GitHub Wed, 10 Jul 2024 01:04:47 -0700


urlyy commented on issue #1547:
URL: https://github.com/apache/fury/issues/1547#issuecomment-2219824706


   I just wrote a demo of this in C++; it only handles big-endian UTF-16.
   I'll create a Rust version soon.
   ```cpp
   #include <codecvt>
   #include <cstdint>
   #include <cstdio>
   #include <iostream>
   #include <locale>
   #include <stdexcept>
   #include <string>
   #include <vector>
   /// Writes the bits of a 16-bit value to stdout as '0'/'1' characters,
   /// most significant bit first, followed by a newline.
   void print(uint16_t num)
   {
       const int bit_count = sizeof(num) * 8;
       int pos = bit_count;
       // Walk from the highest bit index down to bit 0.
       while (pos-- > 0)
       {
           printf("%d", (num >> pos) & 1);
       }
       printf("\n");
   }
   
   /// Writes the bits of an 8-bit value to stdout as '0'/'1' characters,
   /// most significant bit first, followed by a newline.
   void print8(uint8_t num)
   {
       const int bit_count = sizeof(num) * 8;
       // Slide a single-bit mask from the top bit down to the lowest bit.
       for (unsigned mask = 1u << (bit_count - 1); mask != 0; mask >>= 1)
       {
           printf("%d", (num & mask) != 0 ? 1 : 0);
       }
       printf("\n");
   }
   
   /// Converts a sequence of UTF-16 code units into UTF-8 bytes.
   ///
   /// Each element of `utf16` is read as one 16-bit code unit held as a plain
   /// integer value. A high surrogate (0xD800-0xDBFF) must be immediately
   /// followed by a low surrogate (0xDC00-0xDFFF); the pair is combined into one
   /// code point and emitted as four bytes. The function only builds and returns
   /// the byte vector; it writes nothing to stdout.
   ///
   /// @param utf16 code units to convert; may be empty.
   /// @return the UTF-8 encoding of the input.
   /// @throws std::runtime_error if a high surrogate is not followed by a low
   ///         surrogate, or if a low surrogate appears without a preceding high
   ///         surrogate.
   std::vector<uint8_t> utf16_to_utf8(const std::vector<uint16_t> &utf16)
   {
       std::vector<uint8_t> utf8;
       // Every code unit contributes at least one output byte on average
       // (a surrogate pair is two units -> four bytes), so this is a safe floor.
       utf8.reserve(utf16.size());

       for (size_t i = 0; i < utf16.size(); ++i)
       {
           const uint16_t wc = utf16[i];
           if (wc < 0x80)
           {
               // 1-byte UTF-8: 0???????
               utf8.push_back(static_cast<uint8_t>(wc));
           }
           else if (wc < 0x800)
           {
               // 2-byte UTF-8: 110????? 10??????  (11 payload bits)
               utf8.push_back(static_cast<uint8_t>((wc >> 6) | 0b11000000));
               utf8.push_back(static_cast<uint8_t>((wc & 0b111111) | 0b10000000));
           }
           else if (wc >= 0xD800 && wc <= 0xDBFF)
           {
               // High surrogate: a low surrogate must follow immediately.
               if (i + 1 >= utf16.size())
               {
                   throw std::runtime_error("Invalid UTF-16 string");
               }
               const uint16_t wc2 = utf16[i + 1];
               if (wc2 < 0xDC00 || wc2 > 0xDFFF)
               {
                   throw std::runtime_error("Invalid UTF-16 string");
               }
               ++i; // the low surrogate has been consumed

               const uint32_t high_bits = static_cast<uint32_t>(wc - 0xD800) << 10;
               const uint32_t low_bits = static_cast<uint32_t>(wc2 - 0xDC00);
               const uint32_t code_point = (high_bits | low_bits) + 0x10000;

               // 4-byte UTF-8: 11110??? 10?????? 10?????? 10??????  (21 payload bits)
               utf8.push_back(static_cast<uint8_t>((code_point >> 18) | 0b11110000));
               utf8.push_back(static_cast<uint8_t>(((code_point >> 12) & 0b111111) | 0b10000000));
               utf8.push_back(static_cast<uint8_t>(((code_point >> 6) & 0b111111) | 0b10000000));
               utf8.push_back(static_cast<uint8_t>((code_point & 0b111111) | 0b10000000));
           }
           else if (wc >= 0xDC00 && wc <= 0xDFFF)
           {
               // Low surrogate with no preceding high surrogate: encoding it as a
               // 3-byte sequence would produce ill-formed UTF-8.
               throw std::runtime_error("Invalid UTF-16 string");
           }
           else
           {
               // 3-byte UTF-8: 1110???? 10?????? 10??????  (all 16 bits of wc)
               utf8.push_back(static_cast<uint8_t>((wc >> 12) | 0b11100000));
               utf8.push_back(static_cast<uint8_t>(((wc >> 6) & 0b111111) | 0b10000000));
               utf8.push_back(static_cast<uint8_t>((wc & 0b111111) | 0b10000000));
           }
       }
       return utf8;
   }
   
   /// Demo driver: dumps the UTF-16 code units of a sample string, converts
   /// them with utf16_to_utf8, then dumps the resulting bytes and the string
   /// they form.
   int main()
   {
       // A char16_t literal yields the UTF-16 code units directly, so the
       // std::wstring_convert / std::codecvt_utf8_utf16 facilities (deprecated
       // since C++17) are not needed.
       const std::u16string utf16_s = u"Hé€lo, 世界!😀";

       std::vector<uint16_t> utf16;
       utf16.reserve(utf16_s.size());
       std::cout << "=====init utf16:" << std::endl;
       for (const char16_t c : utf16_s)
       {
           printf("0x%04x,", static_cast<unsigned>(c));
           utf16.push_back(static_cast<uint16_t>(c));
       }
       std::cout << "\n";

       const std::vector<uint8_t> utf8 = utf16_to_utf8(utf16);
       std::cout << "=====utf8:" << std::endl;
       for (const uint8_t byte : utf8)
       {
           printf("0x%02x,", static_cast<unsigned>(byte));
       }
       std::cout << std::endl;

       // final UTF-8 string; end with a newline so the shell prompt does not
       // run into the output.
       std::cout << "final string: " << std::string(utf8.begin(), utf8.end()) << std::endl;
       return 0;
   }
   
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [I] [Rust] Support convert utf16 encoded string to utf8 string [fury]

Reply via email to