```python
>>> tbl.shape
(29701056, 373)
>>> Counter([field.type for field in tbl.schema])
Counter({TimestampType(timestamp[us]): 1,
DataType(double): 370,
DictionaryType(dictionary<values=int64, indices=int32, ordered=0>): 1,
DictionaryType(dictionary<values=string, indices=int32, ordered=0>): 1})
>>> df = tbl.to_pandas()
>>> np.asarray([df.iloc[:, col].nunique() for col in range(1, 371)])  # double columns
array([ 5503, 5762, 5672, 5156, 6385, 6576, 5173, 5055, 311,
51, 1125, 961, 6202, 5862, 4919, 5358, 2939, 2927,
3380, 3451, 2297, 1676, 2292, 2482, 2518, 2347, 6551,
2651, 2693, 2966, 2972, 2892, 2942, 2441, 2540, 2550,
2439, 3078, 3127, 2832, 3449, 2291, 3087, 3218, 1694,
1656, 1952, 1784, 2879, 1, 1, 482, 1, 1,
593, 1, 1, 140, 1, 1, 1413, 1, 1,
24, 303, 324, 1314, 8816, 4986, 308, 318, 196,
207, 189, 187, 1800, 1091, 764, 931, 608, 403,
3520, 1284, 267, 597, 1262, 779, 318, 959, 2013,
1158, 1161, 668, 716, 830, 1104, 1124, 515, 506,
296, 2369, 11296, 295, 245, 15, 3192, 1001, 4281,
1428, 1536, 761, 423, 417, 1735, 3631, 521, 627,
920, 1148, 1792, 1842, 1723, 1878, 4419, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1719, 1, 95, 5294,
6499, 1035, 277, 1, 16847, 1, 18471, 1, 1,
1, 1, 1, 15995, 1, 645, 15382, 500, 9802,
2201, 1, 2581, 1635, 2039, 2042, 1583, 2602, 36498,
5988, 6027, 5080, 22053, 49031, 27200, 4003, 7591, 24653,
25501, 15233, 7058, 3660, 5685, 17246, 737, 343, 24883,
14109, 2685, 1, 22599, 1, 1, 1, 1, 1,
1152, 975, 1030, 986, 1, 1518, 1553, 1433, 1470,
28, 1248, 1547, 1624, 1555, 1601, 1602, 1, 1041,
725, 60, 1145, 944, 231, 303, 19, 318, 300,
314, 370, 1277, 118, 737, 29, 26, 500, 229,
232, 238, 246, 1248, 198, 403, 403, 245, 1393,
581, 402, 420, 488, 510, 480, 407, 813, 836,
836, 1309, 1121, 2784, 2732, 1580, 425, 465, 459,
448, 417, 375, 531, 561, 1021, 506, 1501, 501,
987, 1632, 464, 2259, 1058, 1653, 421, 562, 1312,
526, 937, 708, 1001, 911, 978, 1554, 387, 984,
1261, 1261, 1110, 1681, 2281, 1055, 305, 4196, 1305,
631])
>>> # column names are longish strings
>>> np.asarray(df.columns.map(len))
array([ 8, 30, 30, 30, 30, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 32, 32,
29, 29, 40, 40, 30, 30, 30, 30, 30, 30, 30, 31, 31, 29, 29, 29, 29,
27, 27, 27, 27, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 18, 18,
18, 18, 18, 18, 17, 17, 17, 18, 18, 18, 18, 18, 18, 25, 23, 24, 26,
23, 27, 30, 30, 25, 25, 25, 25, 26, 26, 23, 23, 27, 22, 22, 28, 25,
23, 24, 26, 26, 27, 27, 23, 23, 25, 25, 23, 26, 25, 24, 24, 24, 27,
22, 28, 28, 33, 33, 38, 36, 33, 29, 31, 38, 38, 29, 35, 35, 35, 35,
35, 35, 35, 35, 35, 30, 54, 54, 55, 55, 55, 54, 54, 54, 56, 56, 56,
45, 45, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 46, 52, 52, 52,
52, 52, 53, 53, 53, 50, 50, 55, 55, 53, 53, 53, 54, 54, 54, 55, 55,
55, 54, 55, 55, 55, 54, 54, 54, 56, 56, 56, 53, 53, 53, 53, 53, 54,
54, 54, 54, 54, 54, 52, 52, 52, 52, 52, 53, 53, 53, 50, 50, 55, 55,
53, 53, 53, 54, 54, 54, 55, 55, 55, 88, 89, 70, 75, 83, 82, 72, 73,
60, 61, 60, 61, 58, 59, 60, 61, 60, 61, 42, 56, 65, 65, 64, 65, 65,
63, 63, 62, 63, 63, 61, 61, 60, 61, 61, 59, 59, 58, 59, 59, 55, 55,
54, 54, 54, 53, 52, 55, 56, 56, 55, 56, 56, 56, 56, 55, 56, 56, 28,
28, 28, 28, 31, 30, 30, 30, 30, 28, 27, 27, 27, 27, 27, 27, 24, 29,
29, 29, 25, 25, 24, 24, 30, 27, 28, 28, 28, 24, 25, 24, 31, 31, 29,
29, 29, 29, 29, 29, 27, 32, 32, 32, 26, 28, 30, 30, 30, 30, 30, 30,
30, 30, 30, 34, 34, 28, 28, 26, 35, 35, 35, 35, 35, 35, 49, 43, 44,
39, 43, 42, 39, 41, 43, 37, 36, 39, 44, 47, 43, 43, 43, 44, 45, 45,
45, 38, 48, 33, 49, 49, 43, 38, 35, 44, 34, 38, 38, 42, 8, 12],
dtype=int64)
>>> # there is no meaningful index
>>> df.index
RangeIndex(start=0, stop=29701056, step=1)
```
[ Full content available at: https://github.com/apache/arrow/issues/2614 ]
This message was relayed via gitbox.apache.org for [email protected]