Re: [PR] GH-48941: [C++] Generate proper UTF-8 strings in JSON test utilities [arrow]

via GitHub Mon, 26 Jan 2026 06:28:33 -0800


pitrou commented on code in PR #48943:
URL: https://github.com/apache/arrow/pull/48943#discussion_r2727822015



##########
cpp/src/arrow/json/test_common.h:
##########
@@ -110,20 +110,85 @@ struct GenerateImpl {
     return OK(writer.Double(val));
   }
 
-  Status GenerateAscii(const DataType&) {
-    auto size = std::poisson_distribution<>{4}(e);
-    std::uniform_int_distribution<uint16_t> gen_char(32, 126);  // FIXME generate UTF8
-    std::string s(size, '\0');
-    for (char& ch : s) ch = static_cast<char>(gen_char(e));
+  Status GenerateUtf8(const DataType&) {
+    // Generate random UTF-8 encoded strings from valid Unicode scalar values.
+    auto num_codepoints = std::poisson_distribution<>{4}(e);
+    std::string s;
+    s.reserve(num_codepoints * 3);
+
+    for (int i = 0; i < num_codepoints; ++i) {
+      uint32_t codepoint;
+      std::uniform_int_distribution<uint32_t> plane_dist(0, 3);
+      uint32_t plane = plane_dist(e);
+
+      if (plane == 0) {
+        // Basic Multilingual Plane (BMP): U+0000 to U+FFFF
+        // Exclude surrogate code points (U+D800 to U+DFFF)
+        // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.8, D71)
+        // Exclude control chars below U+0020 for readability
+        // Generate from two ranges with equal probability (overrepresents the smaller
+        // upper range):
+        // - Lower: U+0020 to U+D7FF (55,264 values, 50% selection probability)
+        // - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability)
+        if (std::bernoulli_distribution(0.5)(e)) {
+          // Lower range: U+0020 to U+D7FF (before surrogate range)
+          codepoint = std::uniform_int_distribution<uint32_t>(0x0020, 0xD7FF)(e);
+        } else {
+          // Upper range: U+E000 to U+FFFD (after surrogate range)
+          // Note: Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF
+          // Other noncharacters (U+FDD0-U+FDEF, plane-ending pairs) are included
+          // as they are valid Unicode scalar values per the Unicode Standard
+          codepoint = std::uniform_int_distribution<uint32_t>(0xE000, 0xFFFD)(e);
+        }
+      } else if (plane == 1) {
+        // Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF
+        // https://www.unicode.org/roadmaps/smp/
+        codepoint = std::uniform_int_distribution<uint32_t>(0x10000, 0x1FFFF)(e);
+      } else if (plane == 2) {
+        // Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF
+        // https://www.unicode.org/roadmaps/sip/
+        codepoint = std::uniform_int_distribution<uint32_t>(0x20000, 0x2FFFF)(e);
+      } else {
+        // Planes 3–16: U+30000–U+10FFFF
+        // Includes TIP, SSP, PUA-A, PUA-B, and unassigned planes: U+30000 to U+10FFFF
+        // Max valid Unicode codepoint is U+10FFFF per the Standard
+        // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9)
+        codepoint = std::uniform_int_distribution<uint32_t>(0x30000, 0x10FFFF)(e);
+      }
+
+      // Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition)
+      // https://www.rfc-editor.org/rfc/rfc3629.html#section-3
+      if (codepoint <= 0x7F) {
+        // 1-byte sequence: 0xxxxxxx
+        s.push_back(static_cast<char>(codepoint));
+      } else if (codepoint <= 0x7FF) {
+        // 2-byte sequence: 110xxxxx 10xxxxxx
+        s.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
+        s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+      } else if (codepoint <= 0xFFFF) {
+        // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
+        s.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
+        s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+        s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+      } else {
+        // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        s.push_back(static_cast<char>(0xF0 | (codepoint >> 18)));
+        s.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
+        s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+        s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+      }
+    }
+    // Using c_str() is safe here because generation excludes U+0000 (no embedded nulls).
+    // U+0000 can only exist in plane 0 (BMP), and BMP generation starts at U+0020.
     return OK(writer.String(s.c_str()));

Review Comment:
   I think we can just call `writer.String(s)` actually.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] GH-48941: [C++] Generate proper UTF-8 strings in JSON test utilities [arrow]

Reply via email to