wesm opened a new pull request #8671: URL: https://github.com/apache/arrow/pull/8671
This issue was pointed out to me by some folks at UWisc and just getting around to trying out a fix. This produces a substantial performance benefit in numerical comparisons with gcc8 ``` benchmark baseline contender change % counters 5 GreaterArrayArrayInt64/32768/0 414.735m items/sec 1.525b items/sec 267.679 {'iterations': 8792, 'null_percent': 0.0} 2 GreaterArrayArrayInt64/32768/1 411.275m items/sec 1.505b items/sec 266.053 {'iterations': 8676, 'null_percent': 100.0} 16 GreaterArrayArrayInt64/32768/100 406.580m items/sec 1.487b items/sec 265.846 {'iterations': 8639, 'null_percent': 1.0} 20 GreaterArrayArrayInt64/32768/2 411.188m items/sec 1.498b items/sec 264.320 {'iterations': 8700, 'null_percent': 50.0} 6 GreaterArrayArrayInt64/32768/10000 407.791m items/sec 1.458b items/sec 257.431 {'iterations': 8777, 'null_percent': 0.01} 17 GreaterArrayArrayInt64/32768/10 410.239m items/sec 1.438b items/sec 250.545 {'iterations': 8663, 'null_percent': 10.0} 11 GreaterArrayScalarInt64/32768/0 638.903m items/sec 1.882b items/sec 194.540 {'iterations': 13433, 'null_percent': 0.0} 14 GreaterArrayScalarInt64/32768/2 635.268m items/sec 1.866b items/sec 193.770 {'iterations': 13468, 'null_percent': 50.0} 9 GreaterArrayScalarInt64/32768/100 629.207m items/sec 1.838b items/sec 192.130 {'iterations': 13268, 'null_percent': 1.0} 15 GreaterArrayScalarInt64/32768/10 632.650m items/sec 1.848b items/sec 192.115 {'iterations': 13321, 'null_percent': 10.0} 3 GreaterArrayScalarInt64/32768/1 636.101m items/sec 1.858b items/sec 192.093 {'iterations': 13465, 'null_percent': 100.0} 19 GreaterArrayScalarInt64/32768/10000 630.470m items/sec 1.835b items/sec 191.010 {'iterations': 13336, 'null_percent': 0.01} 12 GreaterArrayArrayString/32768/1 295.949m items/sec 302.807m items/sec 2.317 {'iterations': 6323, 'null_percent': 100.0} 21 GreaterArrayScalarString/32768/2 274.259m items/sec 278.502m items/sec 1.547 {'iterations': 5847, 'null_percent': 50.0} 22 GreaterArrayScalarString/32768/1 1.007b items/sec 1.020b items/sec 1.244 {'iterations': 21309, 'null_percent': 100.0} 8 GreaterArrayArrayString/32768/2 148.096m items/sec 146.908m items/sec -0.802 {'iterations': 3172, 'null_percent': 50.0} 10 GreaterArrayArrayString/32768/100 104.447m items/sec 103.111m items/sec -1.280 {'iterations': 2231, 'null_percent': 1.0} 13 GreaterArrayArrayString/32768/10 105.158m items/sec 103.790m items/sec -1.300 {'iterations': 2246, 'null_percent': 10.0} 7 GreaterArrayArrayString/32768/10000 104.408m items/sec 102.958m items/sec -1.388 {'iterations': 2225, 'null_percent': 0.01} 4 GreaterArrayArrayString/32768/0 105.920m items/sec 103.817m items/sec -1.985 {'iterations': 2263, 'null_percent': 0.0} 18 GreaterArrayScalarString/32768/10 645.713m items/sec 614.437m items/sec -4.844 {'iterations': 13876, 'null_percent': 10.0} 1 GreaterArrayScalarString/32768/10000 977.632m items/sec 871.382m items/sec -10.868 {'iterations': 20892, 'null_percent': 0.01} 0 GreaterArrayScalarString/32768/0 999.585m items/sec 877.238m items/sec -12.240 {'iterations': 21302, 'null_percent': 0.0} 23 GreaterArrayScalarString/32768/100 934.922m items/sec 814.268m items/sec -12.905 {'iterations': 19892, 'null_percent': 1.0} ``` The difference is less pronounced with clang-8, but still present ``` benchmark baseline contender change % counters 18 GreaterArrayArrayInt64/32768/2 928.755m items/sec 1.655b items/sec 78.196 {'iterations': 19762, 'null_percent': 50.0} 6 GreaterArrayArrayInt64/32768/1 938.947m items/sec 1.666b items/sec 77.386 {'iterations': 19962, 'null_percent': 100.0} 4 GreaterArrayArrayInt64/32768/0 945.091m items/sec 1.658b items/sec 75.386 {'iterations': 20198, 'null_percent': 0.0} 0 GreaterArrayArrayInt64/32768/10 923.876m items/sec 1.610b items/sec 74.241 {'iterations': 19667, 'null_percent': 10.0} 23 GreaterArrayArrayInt64/32768/100 918.441m items/sec 1.598b items/sec 73.962 {'iterations': 19602, 'null_percent': 1.0} 2 GreaterArrayArrayInt64/32768/10000 914.122m items/sec 1.573b items/sec 72.085 {'iterations': 19437, 'null_percent': 0.01} 21 GreaterArrayScalarString/32768/2 243.973m items/sec 246.087m items/sec 0.866 {'iterations': 5224, 'null_percent': 50.0} 17 GreaterArrayScalarInt64/32768/1 2.736b items/sec 2.638b items/sec -3.590 {'iterations': 58090, 'null_percent': 100.0} 9 GreaterArrayArrayString/32768/0 136.733m items/sec 130.784m items/sec -4.351 {'iterations': 2905, 'null_percent': 0.0} 22 GreaterArrayArrayString/32768/100 136.434m items/sec 127.740m items/sec -6.372 {'iterations': 2918, 'null_percent': 1.0} 19 GreaterArrayArrayString/32768/10000 136.639m items/sec 127.704m items/sec -6.539 {'iterations': 2916, 'null_percent': 0.01} 15 GreaterArrayScalarInt64/32768/10000 2.693b items/sec 2.512b items/sec -6.742 {'iterations': 58669, 'null_percent': 0.01} 3 GreaterArrayScalarInt64/32768/0 2.795b items/sec 2.592b items/sec -7.248 {'iterations': 58721, 'null_percent': 0.0} 13 GreaterArrayArrayString/32768/10 139.225m items/sec 128.587m items/sec -7.641 {'iterations': 2980, 'null_percent': 10.0} 11 GreaterArrayScalarInt64/32768/2 2.759b items/sec 2.537b items/sec -8.040 {'iterations': 57925, 'null_percent': 50.0} 14 GreaterArrayScalarInt64/32768/10 2.767b items/sec 2.517b items/sec -9.045 {'iterations': 58391, 'null_percent': 10.0} 1 GreaterArrayScalarInt64/32768/100 2.738b items/sec 2.485b items/sec -9.246 {'iterations': 58094, 'null_percent': 1.0} 16 GreaterArrayArrayString/32768/1 794.894m items/sec 659.684m items/sec -17.010 {'iterations': 17020, 'null_percent': 100.0} 12 GreaterArrayArrayString/32768/2 182.007m items/sec 150.059m items/sec -17.553 {'iterations': 3905, 'null_percent': 50.0} 7 GreaterArrayScalarString/32768/1 1.093b items/sec 772.486m items/sec -29.297 {'iterations': 23230, 'null_percent': 100.0} 5 GreaterArrayScalarString/32768/10 691.609m items/sec 473.463m items/sec -31.542 {'iterations': 14691, 'null_percent': 10.0} 20 GreaterArrayScalarString/32768/100 1.002b items/sec 603.447m items/sec -39.761 {'iterations': 21340, 'null_percent': 1.0} 10 GreaterArrayScalarString/32768/0 1.054b items/sec 626.234m items/sec -40.574 {'iterations': 22461, 'null_percent': 0.0} 8 GreaterArrayScalarString/32768/10000 1.078b items/sec 605.930m items/sec -43.796 {'iterations': 23251, 'null_percent': 0.01} ``` I don't know why this seems to cause performance to be worse in the string comparison cases. There might be some other things to try here like bit-packing in larger batches. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org