This is an automated email from the ASF dual-hosted git repository.
alenka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 233590474f GH-48448: [Python] Implement Alphanumeric and Surrogate
text in the random schema generator (#48449)
233590474f is described below
commit 233590474f0306fbf16f1c282bdae517a482c579
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Tue Jan 6 17:24:54 2026 +0900
GH-48448: [Python] Implement Alphanumeric and Surrogate text in the random
schema generator (#48449)
### Rationale for this change
To test various cases of field names:
https://github.com/apache/arrow/blob/6456944f5092dedb3f80d9bc80400e857d6571c7/python/pyarrow/tests/strategies.py#L49
It was introduced in
https://github.com/apache/arrow/commit/9da458437162574f3e0d82e4a51dc6c1589b9f94
### What changes are included in this PR?
This PR implements Alphanumeric and Surrogate text in the random schema
generator.
### Are these changes tested?
Yes, I tested them via:
```
PYARROW_TEST_HYPOTHESIS=1 pytest -xvs \
  pyarrow/tests/test_strategies.py::test_fields --hypothesis-show-statistics
```
### Are there any user-facing changes?
No, test-only.
* GitHub Issue: #48448
Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: AlenkaF <[email protected]>
---
python/pyarrow/tests/strategies.py | 31 +++++++++++++++++++++++++++----
1 file changed, 27 insertions(+), 4 deletions(-)
diff --git a/python/pyarrow/tests/strategies.py
b/python/pyarrow/tests/strategies.py
index 8319c9ce3e..3c31650ddf 100644
--- a/python/pyarrow/tests/strategies.py
+++ b/python/pyarrow/tests/strategies.py
@@ -46,7 +46,7 @@ except ImportError:
import pyarrow as pa
-# TODO(kszucs): alphanum_text, surrogate_text
+# Text generation strategies for various character sets
custom_text = st.text(
alphabet=st.characters(
min_codepoint=0x41,
@@ -54,6 +54,23 @@ custom_text = st.text(
)
)
+# alphanum_text: Only alphanumeric characters (a-z, A-Z, 0-9)
+alphanum_text = st.text(
+ alphabet=st.characters(
+ whitelist_categories=('Ll', 'Lu', 'Nd'), # Lowercase, Uppercase, Decimal Number
+ min_codepoint=0x30, # Start from '0' (U+0030)
+ max_codepoint=0x7A # End at 'z' (U+007A)
+ )
+)
+
+# surrogate_text: Unicode supplementary planes (U+10000 to U+10FFFF)
+surrogate_text = st.text(
+ alphabet=st.characters(
+ min_codepoint=0x10000, # Start of Plane 1 (Supplementary Multilingual Plane)
+ max_codepoint=0x10FFFF # End of valid Unicode range (last code point)
+ )
+)
+
null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())
@@ -164,8 +181,10 @@ metadata = st.dictionaries(st.text(), st.text())
@st.composite
-def fields(draw, type_strategy=primitive_types):
- name = draw(custom_text)
+def fields(draw, type_strategy=primitive_types, name_strategy=None):
+ if name_strategy is None:
+ name_strategy = custom_text
+ name = draw(name_strategy)
typ = draw(type_strategy)
if pa.types.is_null(typ):
nullable = True
@@ -243,7 +262,11 @@ all_types = st.deferred(
struct_types(all_types)
)
)
-all_fields = fields(all_types)
+all_fields = st.one_of(
+ fields(all_types), # custom_text
+ fields(all_types, name_strategy=alphanum_text),
+ fields(all_types, name_strategy=surrogate_text)
+)
all_schemas = schemas(all_types)