maartenbreddels commented on a change in pull request #7656:
URL: https://github.com/apache/arrow/pull/7656#discussion_r452830275
##########
File path: python/pyarrow/tests/test_compute.py
##########
@@ -95,6 +95,56 @@ def test_binary_contains_exact():
assert expected.equals(result)
+# utf8proc claims 0x8be is UTF8PROC_CATEGORY_LO
+utf8proc_issue_isalpha = {0x8be, 0x8bf, 0x8c0, 0x8c1, 0x8c2, 0x8c3, 0x8c4,
0x8c5, 0x8c6, 0x8c7, 0xd04, 0xe86, 0xe89, 0xe8c, 0xe8e, 0xe8f, 0xe90, 0xe91,
0xe92, 0xe93, 0xe98, 0xea0, 0xea8, 0xea9, 0xeac, 0x1cf2, 0x1cf3, 0x1cfa,
0x31bb, 0x31bc, 0x31bd, 0x31be, 0x31bf, 0x4db6, 0x4db7, 0x4db8, 0x4db9, 0x4dba,
0x4dbb, 0x4dbc, 0x4dbd, 0x4dbe, 0x4dbf, 0x9ff0, 0x9ff1, 0x9ff2, 0x9ff3, 0x9ff4,
0x9ff5, 0x9ff6, 0x9ff7, 0x9ff8, 0x9ff9, 0x9ffa, 0x9ffb, 0x9ffc, 0xa7ba, 0xa7bb,
0xa7bc, 0xa7bd, 0xa7be, 0xa7bf, 0xa7c2, 0xa7c3, 0xa7c4, 0xa7c5, 0xa7c6, 0xa7c7,
0xa7c8, 0xa7c9, 0xa7ca, 0xa7f5, 0xa7f6, 0xab66, 0xab67, 0xab68, 0xab69,
0x10e80, 0x10e81, 0x10e82, 0x10e83, 0x10e84, 0x10e85, 0x10e86, 0x10e87,
0x10e88, 0x10e89,
+ 0x10e8a, 0x10e8b, 0x10e8c, 0x10e8d, 0x10e8e,
0x10e8f, 0x10e90, 0x10e91, 0x10e92, 0x10e93, 0x10e94, 0x10e95, 0x10e96,
0x10e97, 0x10e98, 0x10e99, 0x10e9a, 0x10e9b, 0x10e9c, 0x10e9d, 0x10e9e,
0x10e9f, 0x10ea0, 0x10ea1, 0x10ea2, 0x10ea3, 0x10ea4, 0x10ea5, 0x10ea6,
0x10ea7, 0x10ea8, 0x10ea9, 0x10eb0, 0x10eb1, 0x10fb0, 0x10fb1, 0x10fb2,
0x10fb3, 0x10fb4, 0x10fb5, 0x10fb6, 0x10fb7, 0x10fb8, 0x10fb9, 0x10fba,
0x10fbb, 0x10fbc, 0x10fbd, 0x10fbe, 0x10fbf, 0x10fc0, 0x10fc1, 0x10fc2,
0x10fc3, 0x10fc4, 0x10fe0, 0x10fe1, 0x10fe2, 0x10fe3, 0x10fe4, 0x10fe5,
0x10fe6, 0x10fe7, 0x10fe8, 0x10fe9, 0x10fea, 0x10feb, 0x10fec, 0x10fed,
0x10fee, 0x10fef, 0x10ff0, 0x10ff1, 0x10ff2, 0x10ff3, 0x10ff4, 0x10ff5,
0x10ff6, }
+# utf8proc claims these are upper case, they are not
+utf8proc_issue_isupper = {0xa7ba, 0xa7bc, 0xa7be, 0xa7c2,
+ 0xa7c4, 0xa7c5, 0xa7c6, 0xa7c7, 0xa7c9, 0xa7f5, }
+# utf8proc misses quite a few, and makes some false claims?
+utf8proc_issue_islower = {0xaa, 0xba, 0x2b0, 0x2b1, 0x2b2, 0x2b3, 0x2b4,
0x2b5, 0x2b6, 0x2b7, 0x2b8, 0x2c0, 0x2c1, 0x2e0, 0x2e1, 0x2e2, 0x2e3, 0x2e4,
0x345, 0x37a, 0x1d2c, 0x1d2d, 0x1d2e, 0x1d2f, 0x1d30, 0x1d31, 0x1d32, 0x1d33,
0x1d34, 0x1d35, 0x1d36, 0x1d37, 0x1d38, 0x1d39, 0x1d3a, 0x1d3b, 0x1d3c, 0x1d3d,
0x1d3e, 0x1d3f, 0x1d40, 0x1d41, 0x1d42, 0x1d43, 0x1d44, 0x1d45, 0x1d46, 0x1d47,
0x1d48, 0x1d49, 0x1d4a, 0x1d4b, 0x1d4c, 0x1d4d, 0x1d4e, 0x1d4f, 0x1d50, 0x1d51,
0x1d52, 0x1d53, 0x1d54, 0x1d55, 0x1d56, 0x1d57, 0x1d58, 0x1d59, 0x1d5a, 0x1d5b,
0x1d5c, 0x1d5d, 0x1d5e, 0x1d5f, 0x1d60, 0x1d61, 0x1d62, 0x1d63, 0x1d64, 0x1d65,
0x1d66, 0x1d67, 0x1d68, 0x1d69, 0x1d6a, 0x1d78, 0x1d9b, 0x1d9c, 0x1d9d, 0x1d9e,
0x1d9f, 0x1da0, 0x1da1, 0x1da2, 0x1da3, 0x1da4, 0x1da5, 0x1da6, 0x1da7, 0x1da8,
0x1da9,
+ 0x1daa, 0x1dab, 0x1dac, 0x1dad, 0x1dae, 0x1daf,
0x1db0, 0x1db1, 0x1db2, 0x1db3, 0x1db4, 0x1db5, 0x1db6, 0x1db7, 0x1db8, 0x1db9,
0x1dba, 0x1dbb, 0x1dbc, 0x1dbd, 0x1dbe, 0x1dbf, 0x2071, 0x207f, 0x2090, 0x2091,
0x2092, 0x2093, 0x2094, 0x2095, 0x2096, 0x2097, 0x2098, 0x2099, 0x209a, 0x209b,
0x209c, 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177, 0x2178,
0x2179, 0x217a, 0x217b, 0x217c, 0x217d, 0x217e, 0x217f, 0x24d0, 0x24d1, 0x24d2,
0x24d3, 0x24d4, 0x24d5, 0x24d6, 0x24d7, 0x24d8, 0x24d9, 0x24da, 0x24db, 0x24dc,
0x24dd, 0x24de, 0x24df, 0x24e0, 0x24e1, 0x24e2, 0x24e3, 0x24e4, 0x24e5, 0x24e6,
0x24e7, 0x24e8, 0x24e9, 0x2c7c, 0x2c7d, 0xa69c, 0xa69d, 0xa770, 0xa7bb, 0xa7bd,
0xa7bf, 0xa7c3, 0xa7c8, 0xa7ca, 0xa7f6, 0xa7f8, 0xa7f9, 0xab5c, 0xab5d, 0xab5e,
0xab5f, 0xab66, 0xab67, 0xab68, }
+# utf8proc claims 0x8be is UTF8PROC_CATEGORY_LO
+utf8proc_issue_isprintable = {0x8be, 0x8bf, 0x8c0, 0x8c1, 0x8c2, 0x8c3, 0x8c4,
0x8c5, 0x8c6, 0x8c7, 0xb55, 0xc77, 0xd04, 0xd81, 0xe86, 0xe89, 0xe8c, 0xe8e,
0xe8f, 0xe90, 0xe91, 0xe92, 0xe93, 0xe98, 0xea0, 0xea8, 0xea9, 0xeac, 0xeba,
0x1abf, 0x1ac0, 0x1cfa, 0x2b97, 0x2bc9, 0x2bff, 0x2e4f, 0x2e50, 0x2e51, 0x2e52,
0x31bb, 0x31bc, 0x31bd, 0x31be, 0x31bf, 0x32ff, 0x4db6, 0x4db7, 0x4db8, 0x4db9,
0x4dba, 0x4dbb, 0x4dbc, 0x4dbd, 0x4dbe, 0x4dbf, 0x9ff0, 0x9ff1, 0x9ff2, 0x9ff3,
0x9ff4, 0x9ff5, 0x9ff6, 0x9ff7, 0x9ff8, 0x9ff9, 0x9ffa, 0x9ffb, 0x9ffc, 0xa7ba,
0xa7bb, 0xa7bc, 0xa7bd, 0xa7be, 0xa7bf, 0xa7c2, 0xa7c3, 0xa7c4, 0xa7c5, 0xa7c6,
0xa7c7, 0xa7c8, 0xa7c9, 0xa7ca, 0xa7f5, 0xa7f6, 0xa82c, 0xab66, 0xab67, 0xab68,
0xab69, 0xab6a, 0xab6b, 0x1019c, 0x10e80, 0x10e81, 0x10e82, 0x10e83, 0x10e84,
0x10e85, 0x10e86, 0x10e87,
+ 0x10e88, 0x10e89, 0x10e8a, 0x10e8b, 0x10e8c,
0x10e8d, 0x10e8e, 0x10e8f, 0x10e90, 0x10e91, 0x10e92, 0x10e93, 0x10e94,
0x10e95, 0x10e96, 0x10e97, 0x10e98, 0x10e99, 0x10e9a, 0x10e9b, 0x10e9c,
0x10e9d, 0x10e9e, 0x10e9f, 0x10ea0, 0x10ea1, 0x10ea2, 0x10ea3, 0x10ea4,
0x10ea5, 0x10ea6, 0x10ea7, 0x10ea8, 0x10ea9, 0x10eab, 0x10eac, 0x10ead,
0x10eb0, 0x10eb1, 0x10fb0, 0x10fb1, 0x10fb2, 0x10fb3, 0x10fb4, 0x10fb5,
0x10fb6, 0x10fb7, 0x10fb8, 0x10fb9, 0x10fba, 0x10fbb, 0x10fbc, 0x10fbd,
0x10fbe, 0x10fbf, 0x10fc0, 0x10fc1, 0x10fc2, 0x10fc3, 0x10fc4, 0x10fc5,
0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca, 0x10fcb, 0x10fe0, 0x10fe1,
0x10fe2, 0x10fe3, 0x10fe4, 0x10fe5, 0x10fe6, 0x10fe7, 0x10fe8, 0x10fe9,
0x10fea, 0x10feb, 0x10fec, 0x10fed, 0x10fee, 0x10fef, 0x10ff0, 0x10ff1,
0x10ff2, 0x10ff3, 0x10ff4, 0x10ff5, 0x10ff6}
+# utf8proc does not store if a codepoint is numeric
+numeric_info_missing = {0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03,
0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96, 0x4ebf, 0x4ec0, 0x4edf, 0x4ee8,
0x4f0d, 0x4f70, 0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341, 0x5343, 0x5344,
0x5345, 0x534c, 0x53c1, 0x53c2, 0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9,
+ 0x5e7a, 0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e,
0x5f10, 0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e, 0x8086, 0x842c, 0x8cae,
0x8cb3, 0x8d30, 0x9621, 0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973, 0xf978,
0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, 0x10fc5, 0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9,
0x10fca, 0x10fcb, }
+# utf8proc has no digit information
+digit_info_missing = {0xb2, 0xb3, 0xb9, 0x1369, 0x136a, 0x136b, 0x136c,
0x136d, 0x136e, 0x136f, 0x1370, 0x1371, 0x19da, 0x2070, 0x2074, 0x2075, 0x2076,
0x2077, 0x2078, 0x2079, 0x2080, 0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086,
0x2087, 0x2088, 0x2089, 0x2460, 0x2461, 0x2462, 0x2463, 0x2464, 0x2465, 0x2466,
0x2467, 0x2468, 0x2474, 0x2475, 0x2476, 0x2477, 0x2478, 0x2479, 0x247a, 0x247b,
0x247c, 0x2488, 0x2489, 0x248a, 0x248b, 0x248c, 0x248d,
+ 0x248e, 0x248f, 0x2490, 0x24ea, 0x24f5, 0x24f6, 0x24f7,
0x24f8, 0x24f9, 0x24fa, 0x24fb, 0x24fc, 0x24fd, 0x24ff, 0x2776, 0x2777, 0x2778,
0x2779, 0x277a, 0x277b, 0x277c, 0x277d, 0x277e, 0x2780, 0x2781, 0x2782, 0x2783,
0x2784, 0x2785, 0x2786, 0x2787, 0x2788, 0x278a, 0x278b, 0x278c, 0x278d, 0x278e,
0x278f, 0x2790, 0x2791, 0x2792, 0x10a40, 0x10a41, 0x10a42, 0x10a43, 0x10e60,
0x10e61, 0x10e62, 0x10e63, 0x10e64, 0x10e65, 0x10e66, 0x10e67, 0x10e68}
+
+codepoints_ignore = {
+ 'isalnum': numeric_info_missing | digit_info_missing |
utf8proc_issue_isalpha,
+ 'isalpha': utf8proc_issue_isalpha,
+ 'isdigit': digit_info_missing,
+ 'isupper': utf8proc_issue_isupper,
+ 'isprintable': utf8proc_issue_isprintable,
+ 'isnumeric': numeric_info_missing,
+ 'islower': utf8proc_issue_islower
+}
+
+
+@pytest.mark.parametrize('function_name', ['isalnum', 'isalpha', 'isascii',
'isdecimal', 'isdigit', 'islower', 'isnumeric', 'isprintable', 'isspace',
'isupper', ])
+@pytest.mark.parametrize('ascii', [False, True])
+def test_string_py_compat_boolean(function_name, ascii):
+ variant = 'ascii' if ascii else 'unicode'
+ arrow_name = f'string_{function_name}_{variant}'
+ py_name = function_name
+ for i in range(128 if ascii else 0x11000):
Review comment:
Yes, many of the C++ ascii tests pass in utf8 data to test if it simply
gets passed through.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org