maartenbreddels commented on a change in pull request #7656:
URL: https://github.com/apache/arrow/pull/7656#discussion_r451435072



##########
File path: python/pyarrow/tests/test_compute.py
##########
@@ -95,6 +95,56 @@ def test_binary_contains_exact():
     assert expected.equals(result)
 
 
+# utf8proc claims 0x8be is UTF8PROC_CATEGORY_LO
+utf8proc_issue_isalpha = {0x8be, 0x8bf, 0x8c0, 0x8c1, 0x8c2, 0x8c3, 0x8c4, 
0x8c5, 0x8c6, 0x8c7, 0xd04, 0xe86, 0xe89, 0xe8c, 0xe8e, 0xe8f, 0xe90, 0xe91, 
0xe92, 0xe93, 0xe98, 0xea0, 0xea8, 0xea9, 0xeac, 0x1cf2, 0x1cf3, 0x1cfa, 
0x31bb, 0x31bc, 0x31bd, 0x31be, 0x31bf, 0x4db6, 0x4db7, 0x4db8, 0x4db9, 0x4dba, 
0x4dbb, 0x4dbc, 0x4dbd, 0x4dbe, 0x4dbf, 0x9ff0, 0x9ff1, 0x9ff2, 0x9ff3, 0x9ff4, 
0x9ff5, 0x9ff6, 0x9ff7, 0x9ff8, 0x9ff9, 0x9ffa, 0x9ffb, 0x9ffc, 0xa7ba, 0xa7bb, 
0xa7bc, 0xa7bd, 0xa7be, 0xa7bf, 0xa7c2, 0xa7c3, 0xa7c4, 0xa7c5, 0xa7c6, 0xa7c7, 
0xa7c8, 0xa7c9, 0xa7ca, 0xa7f5, 0xa7f6, 0xab66, 0xab67, 0xab68, 0xab69, 
0x10e80, 0x10e81, 0x10e82, 0x10e83, 0x10e84, 0x10e85, 0x10e86, 0x10e87, 
0x10e88, 0x10e89,
+                          0x10e8a, 0x10e8b, 0x10e8c, 0x10e8d, 0x10e8e, 
0x10e8f, 0x10e90, 0x10e91, 0x10e92, 0x10e93, 0x10e94, 0x10e95, 0x10e96, 
0x10e97, 0x10e98, 0x10e99, 0x10e9a, 0x10e9b, 0x10e9c, 0x10e9d, 0x10e9e, 
0x10e9f, 0x10ea0, 0x10ea1, 0x10ea2, 0x10ea3, 0x10ea4, 0x10ea5, 0x10ea6, 
0x10ea7, 0x10ea8, 0x10ea9, 0x10eb0, 0x10eb1, 0x10fb0, 0x10fb1, 0x10fb2, 
0x10fb3, 0x10fb4, 0x10fb5, 0x10fb6, 0x10fb7, 0x10fb8, 0x10fb9, 0x10fba, 
0x10fbb, 0x10fbc, 0x10fbd, 0x10fbe, 0x10fbf, 0x10fc0, 0x10fc1, 0x10fc2, 
0x10fc3, 0x10fc4, 0x10fe0, 0x10fe1, 0x10fe2, 0x10fe3, 0x10fe4, 0x10fe5, 
0x10fe6, 0x10fe7, 0x10fe8, 0x10fe9, 0x10fea, 0x10feb, 0x10fec, 0x10fed, 
0x10fee, 0x10fef, 0x10ff0, 0x10ff1, 0x10ff2, 0x10ff3, 0x10ff4, 0x10ff5, 
0x10ff6, }
+# utf8proc claims these are upper case, they are not
+utf8proc_issue_isupper = {0xa7ba, 0xa7bc, 0xa7be, 0xa7c2,
+                          0xa7c4, 0xa7c5, 0xa7c6, 0xa7c7, 0xa7c9, 0xa7f5, }
+# utf8proc misses quite a few, and makes some false claims?
+utf8proc_issue_islower = {0xaa, 0xba, 0x2b0, 0x2b1, 0x2b2, 0x2b3, 0x2b4, 
0x2b5, 0x2b6, 0x2b7, 0x2b8, 0x2c0, 0x2c1, 0x2e0, 0x2e1, 0x2e2, 0x2e3, 0x2e4, 
0x345, 0x37a, 0x1d2c, 0x1d2d, 0x1d2e, 0x1d2f, 0x1d30, 0x1d31, 0x1d32, 0x1d33, 
0x1d34, 0x1d35, 0x1d36, 0x1d37, 0x1d38, 0x1d39, 0x1d3a, 0x1d3b, 0x1d3c, 0x1d3d, 
0x1d3e, 0x1d3f, 0x1d40, 0x1d41, 0x1d42, 0x1d43, 0x1d44, 0x1d45, 0x1d46, 0x1d47, 
0x1d48, 0x1d49, 0x1d4a, 0x1d4b, 0x1d4c, 0x1d4d, 0x1d4e, 0x1d4f, 0x1d50, 0x1d51, 
0x1d52, 0x1d53, 0x1d54, 0x1d55, 0x1d56, 0x1d57, 0x1d58, 0x1d59, 0x1d5a, 0x1d5b, 
0x1d5c, 0x1d5d, 0x1d5e, 0x1d5f, 0x1d60, 0x1d61, 0x1d62, 0x1d63, 0x1d64, 0x1d65, 
0x1d66, 0x1d67, 0x1d68, 0x1d69, 0x1d6a, 0x1d78, 0x1d9b, 0x1d9c, 0x1d9d, 0x1d9e, 
0x1d9f, 0x1da0, 0x1da1, 0x1da2, 0x1da3, 0x1da4, 0x1da5, 0x1da6, 0x1da7, 0x1da8, 
0x1da9,
+                          0x1daa, 0x1dab, 0x1dac, 0x1dad, 0x1dae, 0x1daf, 
0x1db0, 0x1db1, 0x1db2, 0x1db3, 0x1db4, 0x1db5, 0x1db6, 0x1db7, 0x1db8, 0x1db9, 
0x1dba, 0x1dbb, 0x1dbc, 0x1dbd, 0x1dbe, 0x1dbf, 0x2071, 0x207f, 0x2090, 0x2091, 
0x2092, 0x2093, 0x2094, 0x2095, 0x2096, 0x2097, 0x2098, 0x2099, 0x209a, 0x209b, 
0x209c, 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177, 0x2178, 
0x2179, 0x217a, 0x217b, 0x217c, 0x217d, 0x217e, 0x217f, 0x24d0, 0x24d1, 0x24d2, 
0x24d3, 0x24d4, 0x24d5, 0x24d6, 0x24d7, 0x24d8, 0x24d9, 0x24da, 0x24db, 0x24dc, 
0x24dd, 0x24de, 0x24df, 0x24e0, 0x24e1, 0x24e2, 0x24e3, 0x24e4, 0x24e5, 0x24e6, 
0x24e7, 0x24e8, 0x24e9, 0x2c7c, 0x2c7d, 0xa69c, 0xa69d, 0xa770, 0xa7bb, 0xa7bd, 
0xa7bf, 0xa7c3, 0xa7c8, 0xa7ca, 0xa7f6, 0xa7f8, 0xa7f9, 0xab5c, 0xab5d, 0xab5e, 
0xab5f, 0xab66, 0xab67, 0xab68, }
+# utf8proc claims 0x8be is UTF8PROC_CATEGORY_LO
+utf8proc_issue_isprintable = {0x8be, 0x8bf, 0x8c0, 0x8c1, 0x8c2, 0x8c3, 0x8c4, 
0x8c5, 0x8c6, 0x8c7, 0xb55, 0xc77, 0xd04, 0xd81, 0xe86, 0xe89, 0xe8c, 0xe8e, 
0xe8f, 0xe90, 0xe91, 0xe92, 0xe93, 0xe98, 0xea0, 0xea8, 0xea9, 0xeac, 0xeba, 
0x1abf, 0x1ac0, 0x1cfa, 0x2b97, 0x2bc9, 0x2bff, 0x2e4f, 0x2e50, 0x2e51, 0x2e52, 
0x31bb, 0x31bc, 0x31bd, 0x31be, 0x31bf, 0x32ff, 0x4db6, 0x4db7, 0x4db8, 0x4db9, 
0x4dba, 0x4dbb, 0x4dbc, 0x4dbd, 0x4dbe, 0x4dbf, 0x9ff0, 0x9ff1, 0x9ff2, 0x9ff3, 
0x9ff4, 0x9ff5, 0x9ff6, 0x9ff7, 0x9ff8, 0x9ff9, 0x9ffa, 0x9ffb, 0x9ffc, 0xa7ba, 
0xa7bb, 0xa7bc, 0xa7bd, 0xa7be, 0xa7bf, 0xa7c2, 0xa7c3, 0xa7c4, 0xa7c5, 0xa7c6, 
0xa7c7, 0xa7c8, 0xa7c9, 0xa7ca, 0xa7f5, 0xa7f6, 0xa82c, 0xab66, 0xab67, 0xab68, 
0xab69, 0xab6a, 0xab6b, 0x1019c, 0x10e80, 0x10e81, 0x10e82, 0x10e83, 0x10e84, 
0x10e85, 0x10e86, 0x10e87,
+                              0x10e88, 0x10e89, 0x10e8a, 0x10e8b, 0x10e8c, 
0x10e8d, 0x10e8e, 0x10e8f, 0x10e90, 0x10e91, 0x10e92, 0x10e93, 0x10e94, 
0x10e95, 0x10e96, 0x10e97, 0x10e98, 0x10e99, 0x10e9a, 0x10e9b, 0x10e9c, 
0x10e9d, 0x10e9e, 0x10e9f, 0x10ea0, 0x10ea1, 0x10ea2, 0x10ea3, 0x10ea4, 
0x10ea5, 0x10ea6, 0x10ea7, 0x10ea8, 0x10ea9, 0x10eab, 0x10eac, 0x10ead, 
0x10eb0, 0x10eb1, 0x10fb0, 0x10fb1, 0x10fb2, 0x10fb3, 0x10fb4, 0x10fb5, 
0x10fb6, 0x10fb7, 0x10fb8, 0x10fb9, 0x10fba, 0x10fbb, 0x10fbc, 0x10fbd, 
0x10fbe, 0x10fbf, 0x10fc0, 0x10fc1, 0x10fc2, 0x10fc3, 0x10fc4, 0x10fc5, 
0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca, 0x10fcb, 0x10fe0, 0x10fe1, 
0x10fe2, 0x10fe3, 0x10fe4, 0x10fe5, 0x10fe6, 0x10fe7, 0x10fe8, 0x10fe9, 
0x10fea, 0x10feb, 0x10fec, 0x10fed, 0x10fee, 0x10fef, 0x10ff0, 0x10ff1, 
0x10ff2, 0x10ff3, 0x10ff4, 0x10ff5, 0x10ff6}
+# utf8proc does not store if a codepoint is numeric
+numeric_info_missing = {0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03, 
0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96, 0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 
0x4f0d, 0x4f70, 0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341, 0x5343, 0x5344, 
0x5345, 0x534c, 0x53c1, 0x53c2, 0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9,
+                        0x5e7a, 0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 
0x5f10, 0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e, 0x8086, 0x842c, 0x8cae, 
0x8cb3, 0x8d30, 0x9621, 0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973, 0xf978, 
0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, 0x10fc5, 0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 
0x10fca, 0x10fcb, }
+# utf8proc has no digit information
+digit_info_missing = {0xb2, 0xb3, 0xb9, 0x1369, 0x136a, 0x136b, 0x136c, 
0x136d, 0x136e, 0x136f, 0x1370, 0x1371, 0x19da, 0x2070, 0x2074, 0x2075, 0x2076, 
0x2077, 0x2078, 0x2079, 0x2080, 0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086, 
0x2087, 0x2088, 0x2089, 0x2460, 0x2461, 0x2462, 0x2463, 0x2464, 0x2465, 0x2466, 
0x2467, 0x2468, 0x2474, 0x2475, 0x2476, 0x2477, 0x2478, 0x2479, 0x247a, 0x247b, 
0x247c, 0x2488, 0x2489, 0x248a, 0x248b, 0x248c, 0x248d,
+                      0x248e, 0x248f, 0x2490, 0x24ea, 0x24f5, 0x24f6, 0x24f7, 
0x24f8, 0x24f9, 0x24fa, 0x24fb, 0x24fc, 0x24fd, 0x24ff, 0x2776, 0x2777, 0x2778, 
0x2779, 0x277a, 0x277b, 0x277c, 0x277d, 0x277e, 0x2780, 0x2781, 0x2782, 0x2783, 
0x2784, 0x2785, 0x2786, 0x2787, 0x2788, 0x278a, 0x278b, 0x278c, 0x278d, 0x278e, 
0x278f, 0x2790, 0x2791, 0x2792, 0x10a40, 0x10a41, 0x10a42, 0x10a43, 0x10e60, 
0x10e61, 0x10e62, 0x10e63, 0x10e64, 0x10e65, 0x10e66, 0x10e67, 0x10e68}
+
+codepoints_ignore = {
+    'isalnum': numeric_info_missing | digit_info_missing | 
utf8proc_issue_isalpha,
+    'isalpha': utf8proc_issue_isalpha,
+    'isdigit': digit_info_missing,
+    'isupper': utf8proc_issue_isupper,
+    'isprintable': utf8proc_issue_isprintable,
+    'isnumeric': numeric_info_missing,
+    'islower': utf8proc_issue_islower
+}
+
+
+@pytest.mark.parametrize('function_name', ['isalnum', 'isalpha', 'isascii', 
'isdecimal', 'isdigit', 'islower', 'isnumeric', 'isprintable', 'isspace', 
'isupper', ])
+@pytest.mark.parametrize('ascii', [False, True])
+def test_string_py_compat_boolean(function_name, ascii):
+    variant = 'ascii' if ascii else 'unicode'
+    arrow_name = f'string_{function_name}_{variant}'
+    py_name = function_name
+    for i in range(128 if ascii else 0x11000):

Review comment:
       It runs fast, so performance is not an issue. As it is now, all 
non-ASCII values are 'pass through'. We didn't formally agree on this, but this 
is how we started (with `ascii_lower`), and it seems like a good idea.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to