[GitHub] [arrow] dhirschfeld commented on issue #2614: Performance Question

GitHub Wed, 03 Oct 2018 23:06:04 -0700

```python
>>> tbl.shape
(29701056, 373)
>>> Counter([field.type for field in tbl.schema])
Counter({TimestampType(timestamp[us]): 1,
         DataType(double): 370,
         DictionaryType(dictionary<values=int64, indices=int32, ordered=0>): 1,
         DictionaryType(dictionary<values=string, indices=int32, ordered=0>): 1})
>>> df = tbl.to_pandas()
>>> np.asarray([df.iloc[:, col].nunique() for col in range(1, 371)]) # double columns
array([ 5503,  5762,  5672,  5156,  6385,  6576,  5173,  5055,   311,
          51,  1125,   961,  6202,  5862,  4919,  5358,  2939,  2927,
        3380,  3451,  2297,  1676,  2292,  2482,  2518,  2347,  6551,
        2651,  2693,  2966,  2972,  2892,  2942,  2441,  2540,  2550,
        2439,  3078,  3127,  2832,  3449,  2291,  3087,  3218,  1694,
        1656,  1952,  1784,  2879,     1,     1,   482,     1,     1,
         593,     1,     1,   140,     1,     1,  1413,     1,     1,
          24,   303,   324,  1314,  8816,  4986,   308,   318,   196,
         207,   189,   187,  1800,  1091,   764,   931,   608,   403,
        3520,  1284,   267,   597,  1262,   779,   318,   959,  2013,
        1158,  1161,   668,   716,   830,  1104,  1124,   515,   506,
         296,  2369, 11296,   295,   245,    15,  3192,  1001,  4281,
        1428,  1536,   761,   423,   417,  1735,  3631,   521,   627,
         920,  1148,  1792,  1842,  1723,  1878,  4419,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,  1719,     1,    95,  5294,
        6499,  1035,   277,     1, 16847,     1, 18471,     1,     1,
           1,     1,     1, 15995,     1,   645, 15382,   500,  9802,
        2201,     1,  2581,  1635,  2039,  2042,  1583,  2602, 36498,
        5988,  6027,  5080, 22053, 49031, 27200,  4003,  7591, 24653,
       25501, 15233,  7058,  3660,  5685, 17246,   737,   343, 24883,
       14109,  2685,     1, 22599,     1,     1,     1,     1,     1,
        1152,   975,  1030,   986,     1,  1518,  1553,  1433,  1470,
          28,  1248,  1547,  1624,  1555,  1601,  1602,     1,  1041,
         725,    60,  1145,   944,   231,   303,    19,   318,   300,
         314,   370,  1277,   118,   737,    29,    26,   500,   229,
         232,   238,   246,  1248,   198,   403,   403,   245,  1393,
         581,   402,   420,   488,   510,   480,   407,   813,   836,
         836,  1309,  1121,  2784,  2732,  1580,   425,   465,   459,
         448,   417,   375,   531,   561,  1021,   506,  1501,   501,
         987,  1632,   464,  2259,  1058,  1653,   421,   562,  1312,
         526,   937,   708,  1001,   911,   978,  1554,   387,   984,
        1261,  1261,  1110,  1681,  2281,  1055,   305,  4196,  1305,
         631])
>>> # column names are longish strings
>>> np.asarray(df.columns.map(len))
array([ 8, 30, 30, 30, 30, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 32, 32,
       29, 29, 40, 40, 30, 30, 30, 30, 30, 30, 30, 31, 31, 29, 29, 29, 29,
       27, 27, 27, 27, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 18, 18,
       18, 18, 18, 18, 17, 17, 17, 18, 18, 18, 18, 18, 18, 25, 23, 24, 26,
       23, 27, 30, 30, 25, 25, 25, 25, 26, 26, 23, 23, 27, 22, 22, 28, 25,
       23, 24, 26, 26, 27, 27, 23, 23, 25, 25, 23, 26, 25, 24, 24, 24, 27,
       22, 28, 28, 33, 33, 38, 36, 33, 29, 31, 38, 38, 29, 35, 35, 35, 35,
       35, 35, 35, 35, 35, 30, 54, 54, 55, 55, 55, 54, 54, 54, 56, 56, 56,
       45, 45, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 46, 52, 52, 52,
       52, 52, 53, 53, 53, 50, 50, 55, 55, 53, 53, 53, 54, 54, 54, 55, 55,
       55, 54, 55, 55, 55, 54, 54, 54, 56, 56, 56, 53, 53, 53, 53, 53, 54,
       54, 54, 54, 54, 54, 52, 52, 52, 52, 52, 53, 53, 53, 50, 50, 55, 55,
       53, 53, 53, 54, 54, 54, 55, 55, 55, 88, 89, 70, 75, 83, 82, 72, 73,
       60, 61, 60, 61, 58, 59, 60, 61, 60, 61, 42, 56, 65, 65, 64, 65, 65,
       63, 63, 62, 63, 63, 61, 61, 60, 61, 61, 59, 59, 58, 59, 59, 55, 55,
       54, 54, 54, 53, 52, 55, 56, 56, 55, 56, 56, 56, 56, 55, 56, 56, 28,
       28, 28, 28, 31, 30, 30, 30, 30, 28, 27, 27, 27, 27, 27, 27, 24, 29,
       29, 29, 25, 25, 24, 24, 30, 27, 28, 28, 28, 24, 25, 24, 31, 31, 29,
       29, 29, 29, 29, 29, 27, 32, 32, 32, 26, 28, 30, 30, 30, 30, 30, 30,
       30, 30, 30, 34, 34, 28, 28, 26, 35, 35, 35, 35, 35, 35, 49, 43, 44,
       39, 43, 42, 39, 41, 43, 37, 36, 39, 44, 47, 43, 43, 43, 44, 45, 45,
       45, 38, 48, 33, 49, 49, 43, 38, 35, 44, 34, 38, 38, 42,  8, 12],
      dtype=int64)
>>> # there is no meaningful index
>>> df.index
RangeIndex(start=0, stop=29701056, step=1)
```


[ Full content available at: https://github.com/apache/arrow/issues/2614 ]
This message was relayed via gitbox.apache.org for [email protected]

[GitHub] [arrow] dhirschfeld commented on issue #2614: Performance Question

Reply via email to