Spark): Perform scalar data type conversion for Spark over HTTP [arrow-adbc]

via GitHub Wed, 25 Sep 2024 07:43:42 -0700


CurtHagenlocher commented on code in PR #2152:
URL: https://github.com/apache/arrow-adbc/pull/2152#discussion_r1775354213



##########
csharp/test/Drivers/Apache/Spark/StringValueTests.cs:
##########
@@ -50,11 +50,11 @@ public static IEnumerable<object[]> ByteArrayData(int size)
         [InlineData(null)]
         [InlineData("")]
         [InlineData("你好")]
-        [InlineData("String contains formatting characters tab\t, newline\n, 
carriage return\r.", "3.4.0")]
+        [InlineData("String contains formatting characters tab\t, newline\n, 
carriage return\r.", SparkServerType.Databricks)]
         [InlineData(" Leading and trailing spaces ")]
-        public async Task TestStringData(string? value, string? minVersion = 
null)
+        internal async Task TestStringData(string? value, SparkServerType? 
serverType = default)

Review Comment:
   Will these tests still run if marked as `internal`? (Some test frameworks 
require `public`.)



##########
csharp/src/Drivers/Apache/Hive2/DecimalUtility.cs:
##########
@@ -0,0 +1,452 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+using System;
+using System.Numerics;
+
+namespace Apache.Arrow.Adbc.Drivers.Apache.Hive2
+{
+    internal static class DecimalUtility
+    {
+        private const char AsciiZero = '0';
+        private const int AsciiDigitMaxIndex = '9' - AsciiZero;
+        private const char AsciiMinus = '-';
+        private const char AsciiPlus = '+';
+        private const char AsciiUpperE = 'E';
+        private const char AsciiLowerE = 'e';
+        private const char AsciiPeriod = '.';
+
+        /// <summary>
+        /// Gets the BigInteger bytes for the given string value.
+        /// </summary>
+        /// <param name="value">The numeric string value to get bytes 
for.</param>
+        /// <param name="precision">The decimal precision for the target 
Decimal[128|256]</param>
+        /// <param name="scale">The decimal scale for the target 
Decimal[128|256]</param>
+        /// <param name="byteWidth">The width in bytes for the target buffer. 
Should match the length of the bytes parameter.</param>
+        /// <param name="bytes">The buffer to place the BigInteger bytes 
into.</param>
+        /// <exception cref="ArgumentOutOfRangeException"></exception>
+        internal static void GetBytes(string value, int precision, int scale, 
int byteWidth, Span<byte> bytes)
+        {
+            if (precision < 1)
+            {
+                throw new ArgumentOutOfRangeException(nameof(precision), 
precision, "precision value must be greater than zero.");
+            }
+            if (scale < 0 || scale >= precision)
+            {
+                throw new ArgumentOutOfRangeException(nameof(scale), scale, 
"scale value must be in the range 0 .. precision.");
+            }
+            if (byteWidth > bytes.Length)
+            {
+                throw new ArgumentOutOfRangeException(nameof(byteWidth), 
byteWidth, $"value for byteWidth {byteWidth} exceeds the the size of bytes.");
+            }
+
+            BigInteger intergerValue = ToBigInteger(value, precision, scale);
+
+            FillBytes(bytes, intergerValue, byteWidth);
+        }
+
+        private static void FillBytes(Span<byte> bytes, BigInteger 
integerValue, int byteWidth)
+        {
+            int bytesWritten = 0;
+#if NETCOREAPP
+            if (!integerValue.TryWriteBytes(bytes, out bytesWritten, false, 
!BitConverter.IsLittleEndian))
+            {
+                throw new OverflowException("Could not extract bytes from 
integer value " + integerValue);
+            }
+#else
+            byte[] tempBytes = integerValue.ToByteArray();
+            bytesWritten = tempBytes.Length;
+            if (bytesWritten > bytes.Length)
+            {
+                throw new OverflowException($"Decimal size greater than 
{byteWidth} bytes: {bytesWritten}");
+            }
+            tempBytes.CopyTo(bytes);
+#endif
+            byte fillByte = (byte)(integerValue < 0 ? 255 : 0);
+            for (int i = bytesWritten; i < byteWidth; i++)
+            {
+                bytes[i] = fillByte;
+            }
+        }
+
+        private static BigInteger ToBigInteger(string value, int precision, 
int scale)
+        {
+            BigInteger intergerValue;
+#if NETCOREAPP
+            ReadOnlySpan<char> significantValue = GetSignificantValue(value, 
precision, scale);
+            intergerValue = BigInteger.Parse(significantValue);
+#else
+            ReadOnlySpan<char> significantValue = 
GetSignificantValue(value.AsSpan(), precision, scale);
+            intergerValue = BigInteger.Parse(significantValue.ToString());
+#endif
+            return intergerValue;
+        }
+
+        private static ReadOnlySpan<char> 
GetSignificantValue(ReadOnlySpan<char> value, int precision, int scale)
+        {
+            ParseDecimal(value, out ParserState state);
+
+            ProcessDecimal(value,
+                precision,
+                scale,
+                state,
+                out char sign,
+                out ReadOnlySpan<char> integerSpan,
+                out ReadOnlySpan<char> fractionalSpan,
+                out int neededScale);
+
+            Span<char> significant = new char[precision + 1];
+            BuildSignificantValue(
+                sign,
+                scale,
+                integerSpan,
+                fractionalSpan,
+                neededScale,
+                significant);
+
+            return significant;
+        }
+
+        private static void ProcessDecimal(ReadOnlySpan<char> value, int 
precision, int scale, ParserState state, out char sign, out ReadOnlySpan<char> 
integerSpan, out ReadOnlySpan<char> fractionalSpan, out int neededScale)
+        {
+            int int_length = 0;
+            int frac_length = 0;
+            int exponent = 0;
+
+            if (state.IntegerStart != -1 && state.IntegerEnd != -1) int_length 
= state.IntegerEnd - state.IntegerStart + 1;
+            if (state.FractionalStart != -1 && state.FractionalEnd != -1) 
frac_length = state.FractionalEnd - state.FractionalStart + 1;
+            if (state.ExponentIndex != -1 && state.ExponentStart != -1 && 
state.ExponentEnd != -1 && state.ExponentEnd >= state.ExponentStart)
+            {
+                int expStart = state.ExpSignIndex != -1 ? state.ExpSignIndex : 
state.ExponentStart;
+                int expLength = state.ExponentEnd - expStart + 1;
+                ReadOnlySpan<char> exponentSpan = value.Slice(expStart, 
expLength);
+#if NETCOREAPP
+                exponent = int.Parse(exponentSpan);
+#else
+                exponent = int.Parse(exponentSpan.ToString());
+#endif
+            }
+            integerSpan = int_length > 0 ? value.Slice(state.IntegerStart, 
state.IntegerEnd - state.IntegerStart + 1) : [];
+            fractionalSpan = frac_length > 0 ? 
value.Slice(state.FractionalStart, state.FractionalEnd - state.FractionalStart 
+ 1) : [];
+            Span<char> tempSignificant;
+            if (exponent != 0)
+            {
+                tempSignificant = new char[int_length + frac_length];
+                if (int_length > 0) value.Slice(state.IntegerStart, 
state.IntegerEnd - state.IntegerStart + 1).CopyTo(tempSignificant.Slice(0));
+                if (frac_length > 0) value.Slice(state.FractionalStart, 
state.FractionalEnd - state.FractionalStart + 
1).CopyTo(tempSignificant.Slice(int_length));
+                // Trim trailing zeros from combined string
+                while (tempSignificant[tempSignificant.Length - 1] == 
AsciiZero)
+                {
+                    tempSignificant = tempSignificant.Slice(0, 
tempSignificant.Length - 1);
+                }
+                // Recalculate integer and fractional length
+                if (exponent > 0)
+                {
+                    int_length = Math.Min(int_length + exponent, 
tempSignificant.Length);
+                    frac_length = Math.Max(Math.Min(frac_length - exponent, 
tempSignificant.Length - int_length), 0);
+                }
+                else
+                {
+                    int_length = Math.Max(int_length + exponent, 0);
+                    frac_length = Math.Max(Math.Min(frac_length - exponent, 
tempSignificant.Length - int_length), 0);
+                }
+                // Reset the integer and fractional span
+                integerSpan = tempSignificant.Slice(0, int_length);
+                fractionalSpan = tempSignificant.Slice(int_length, 
frac_length);
+            }
+
+            int neededPrecision = int_length + frac_length;
+            neededScale = frac_length;
+            if (neededPrecision > precision)
+            {
+                throw new OverflowException($"Decimal precision cannot be 
greater than that in the Arrow vector: {value.ToString()} has precision > 
{precision}");
+            }
+            if (neededScale > scale)
+            {
+                throw new OverflowException($"Decimal scale cannot be greater 
than that in the Arrow vector: {value.ToString()} has scale > {scale}");
+            }
+            sign = state.SignIndex != -1 ? value[state.SignIndex] : AsciiPlus;
+        }
+
+        private static void BuildSignificantValue(
+            char sign,
+            int scale,
+            ReadOnlySpan<char> integerSpan,
+            ReadOnlySpan<char> fractionalSpan,
+            int neededScale,
+            Span<char> significant)
+        {
+            significant[0] = sign;
+            int end = 0;
+            integerSpan.CopyTo(significant.Slice(end + 1));
+            end += integerSpan.Length;
+            fractionalSpan.CopyTo(significant.Slice(end + 1));
+            end += fractionalSpan.Length;
+
+            // Add trailing zeros to adjust for scale
+            while (neededScale < scale)
+            {
+                neededScale++;
+                end++;
+                significant[end] = AsciiZero;
+            }
+        }
+
+        private enum ParseState
+        {
+            StartWhiteSpace,
+            SignOrDigitOrDecimal,
+            DigitOrDecimalOrExponent,
+            FractionOrExponent,
+            ExpSignOrExpValue,
+            ExpValue,
+            EndWhiteSpace,
+            Invalid,
+        }
+
+        private struct ParserState
+        {
+            public ParseState CurrentState = ParseState.StartWhiteSpace;
+            public int SignIndex = -1;
+            public int IntegerStart = -1;
+            public int IntegerEnd = -1;
+            public int DecimalIndex = -1;
+            public int FractionalStart = -1;
+            public int FractionalEnd = -1;
+            public int ExponentIndex = -1;
+            public int ExpSignIndex = -1;
+            public int ExponentStart = -1;
+            public int ExponentEnd = -1;
+            public bool HasZero = false;
+
+            public ParserState() { }
+        }
+
+        private static void ParseDecimal(ReadOnlySpan<char> value, out 
ParserState parserState)
+        {
+            ParserState state = new ParserState();
+            int index = 0;
+            int length = value.Length;
+            while (index < length)
+            {
+                char c = value[index];
+                switch (state.CurrentState)
+                {
+                    case ParseState.StartWhiteSpace:
+                        if (!char.IsWhiteSpace(c))
+                        {
+                            state.CurrentState = 
ParseState.SignOrDigitOrDecimal;
+                        }
+                        else
+                        {
+                            index++;
+                        }
+                        break;
+                    case ParseState.SignOrDigitOrDecimal:
+                        // Is Ascii Numeric
+                        if ((uint)(c - AsciiZero) <= AsciiDigitMaxIndex)
+                        {
+                            if (!state.HasZero && c == AsciiZero) 
state.HasZero |= true;
+                            state.IntegerStart = index;
+                            state.IntegerEnd = index;
+                            index++;
+                            state.CurrentState = 
ParseState.DigitOrDecimalOrExponent;
+                        }
+                        else if (c == AsciiMinus || c == AsciiPlus)
+                        {
+                            state.SignIndex = index;
+                            index++;
+                            state.CurrentState = 
ParseState.DigitOrDecimalOrExponent;
+                        }
+                        else if (c == AsciiPeriod)
+                        {
+                            state.DecimalIndex = index;
+                            index++;
+                            state.CurrentState = ParseState.FractionOrExponent;
+                        }
+                        else if (char.IsWhiteSpace(c))
+                        {
+                            index++;
+                            state.CurrentState = ParseState.EndWhiteSpace;
+                        }
+                        else
+                        {
+                            state.CurrentState = ParseState.Invalid;
+                        }
+                        break;
+                    case ParseState.DigitOrDecimalOrExponent:
+                        // Is Ascii Numeric
+                        if ((uint)(c - AsciiZero) <= AsciiDigitMaxIndex)
+                        {
+                            if (state.IntegerStart == -1) state.IntegerStart = 
index;
+                            if (!state.HasZero && c == AsciiZero) 
state.HasZero |= true;
+                            state.IntegerEnd = index;
+                            index++;
+                        }
+                        else if (c == AsciiPeriod)
+                        {
+                            state.DecimalIndex = index;
+                            index++;
+                            state.CurrentState = ParseState.FractionOrExponent;
+                        }
+                        else if (c == AsciiUpperE || c == AsciiLowerE)
+                        {
+                            state.ExponentIndex = index;
+                            index++;
+                            state.CurrentState = ParseState.ExpSignOrExpValue;
+                        }
+                        else if (char.IsWhiteSpace(c))
+                        {
+                            index++;
+                            state.CurrentState = ParseState.EndWhiteSpace;
+                        }
+                        else
+                        {
+                            state.CurrentState = ParseState.Invalid;
+                        }
+                        break;
+                    case ParseState.FractionOrExponent:
+                        // Is Ascii Numeric
+                        if ((uint)(c - AsciiZero) <= AsciiDigitMaxIndex)
+                        {
+                            if (state.FractionalStart == -1) 
state.FractionalStart = index;
+                            if (!state.HasZero && c == AsciiZero) 
state.HasZero |= true;
+                            state.FractionalEnd = index;
+                            index++;
+                        }
+                        else if (c == AsciiUpperE || c == AsciiLowerE)
+                        {
+                            state.ExponentIndex = index;
+                            index++;
+                            state.CurrentState = ParseState.ExpSignOrExpValue;
+                        }
+                        else if (char.IsWhiteSpace(c))
+                        {
+                            index++;
+                            state.CurrentState = ParseState.EndWhiteSpace;
+                        }
+                        else
+                        {
+                            state.CurrentState = ParseState.Invalid;
+                        }
+                        break;
+                    case ParseState.ExpSignOrExpValue:
+                        // Is Ascii Numeric
+                        if ((uint)(c - AsciiZero) <= AsciiDigitMaxIndex)
+                        {
+                            if (state.ExponentStart == -1) state.ExponentStart 
= index;
+                            state.ExponentEnd = index;
+                            index++;
+                            state.CurrentState = ParseState.ExpValue;
+                        }
+                        else if (c == AsciiMinus || c == AsciiPlus)
+                        {
+                            state.ExpSignIndex = index;
+                            index++;
+                            state.CurrentState = ParseState.ExpValue;
+                        }
+                        else if (char.IsWhiteSpace(c))
+                        {
+                            index++;
+                            state.CurrentState = ParseState.EndWhiteSpace;
+                        }
+                        else
+                        {
+                            state.CurrentState = ParseState.Invalid;
+                        }
+                        break;
+                    case ParseState.ExpValue:
+                        // Is Ascii Numeric
+                        if ((uint)(c - AsciiZero) <= AsciiDigitMaxIndex)
+                        {
+                            if (state.ExponentStart == -1) state.ExponentStart 
= index;
+                            state.ExponentEnd = index;
+                            index++;
+                        }
+                        else if (char.IsWhiteSpace(c))
+                        {
+                            index++;
+                            state.CurrentState = ParseState.EndWhiteSpace;
+                        }
+                        else
+                        {
+                            state.CurrentState = ParseState.Invalid;
+                        }
+                        break;
+                    case ParseState.EndWhiteSpace:
+                        if (char.IsWhiteSpace(c))
+                        {
+                            index++;
+                            state.CurrentState = ParseState.EndWhiteSpace;
+                        }
+                        else
+                        {
+                            state.CurrentState = ParseState.Invalid;
+                        }
+                        break;
+                    case ParseState.Invalid:
+                        throw new ArgumentOutOfRangeException(nameof(value), 
value.ToString(), $"Invalid numeric value at index {index}.");
+                }
+            }
+            // Trim leading zeros from integer porttion

Review Comment:
   nit: typo in `porttion`



##########
csharp/src/Drivers/Apache/Hive2/HiveServer2Reader.cs:
##########
@@ -78,6 +112,166 @@ static IArrowArray GetArray(TColumn column)
                 (IArrowArray?)column.StringVal?.Values ??
                 (IArrowArray?)column.BinaryVal?.Values ??
                 throw new InvalidOperationException("unsupported data type");
+            if (expectedArrowType != null && arrowArray is StringArray 
stringArray && s_arrowStringConverters.ContainsKey(expectedArrowType.TypeId))
+            {
+                // Perform a conversion from string to native/scalar type.
+                Func<StringArray, IArrowType, IArrowArray> converter = 
s_arrowStringConverters[expectedArrowType.TypeId];
+                return converter(stringArray, expectedArrowType);
+            }
+            return arrowArray;
+        }
+
+        private static Date32Array ConvertToDate32(StringArray array, 
IArrowType _)
+        {
+            var resultArray = new Date32Array.Builder();
+            foreach (string item in (IReadOnlyCollection<string>)array)
+            {
+                if (item == null)
+                {
+                    resultArray.AppendNull();
+                    continue;
+                }
+
+                ReadOnlySpan<char> date = item.AsSpan();
+                bool isKnownFormat = date.Length >= 8 && date[4] == AsciiDash 
&& date[7] == AsciiDash;
+                if (isKnownFormat)
+                {
+                    DateTime value = ConverToDateTime(date);
+                    resultArray.Append(value);
+                }
+                else
+                {
+                    resultArray.Append(DateTime.Parse(item, 
CultureInfo.InvariantCulture));
+                }
+            }
+
+            return resultArray.Build();
+        }
+
+        private static DateTime ConverToDateTime(ReadOnlySpan<char> date)

Review Comment:
   nit: typo in `ConverToDateTime`



##########
csharp/test/Drivers/Apache/Hive2/DecimalUtilityTests.cs:
##########
@@ -0,0 +1,173 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+using System;
+using System.Collections.Generic;
+using System.Data.SqlTypes;
+using System.Diagnostics;
+using System.Globalization;
+using Apache.Arrow.Adbc.Drivers.Apache.Hive2;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Apache.Arrow.Adbc.Tests.Drivers.Apache.Hive2
+{
+    /// <summary>
+    /// Class for testing the Decimal Utilities tests.
+    /// </summary>
+    public class DecimalUtilityTests(ITestOutputHelper outputHelper)
+    {
+        private readonly ITestOutputHelper _outputHelper = outputHelper;
+
+        [SkippableTheory]
+        [MemberData(nameof(Decimal128Data))]
+        public void TestCanConvertDecimal(string value, int precision, int 
scale, int byteWidth, byte[] expected, SqlDecimal? expectedDecimal = default)
+        {
+            byte[] actual = new byte[byteWidth];
+            DecimalUtility.GetBytes(value, precision, scale, byteWidth, 
actual);
+            Assert.Equal(expected, actual);
+            Assert.Equal(0, byteWidth % 4);
+            int[] buffer = new int[byteWidth / 4];
+            for (int i = 0; i < buffer.Length; i++)
+            {
+                buffer[i] = BitConverter.ToInt32(actual, i * sizeof(int));
+            }
+            SqlDecimal actualDecimal = GetSqlDecimal128(actual, 0, precision, 
scale);
+            if (expectedDecimal != null) Assert.Equal(expectedDecimal, 
actualDecimal);
+        }
+
+        [Fact(Skip = "Run manually to confirm equivalent performance")]
+        public void TestConvertDecimalPerformance()
+        {
+            Stopwatch stopwatch = new();
+
+            int testCount = 1000000;
+            string testValue = "99999999999999999999999999999999999999";
+            int byteWidth = 16;
+            byte[] buffer = new byte[byteWidth];
+            Decimal128Array.Builder builder = new Decimal128Array.Builder(new 
Types.Decimal128Type(38, 0));
+            stopwatch.Restart();
+            for (int i = 0; i < testCount; i++)
+            {
+                if (decimal.TryParse(testValue, NumberStyles.Float, 
NumberFormatInfo.InvariantInfo, out var actualDecimal))
+                {
+                    builder.Append(new SqlDecimal(actualDecimal));
+                }
+                else
+                {
+                    builder.Append(testValue);
+                }
+            }
+            stopwatch.Stop();
+            _outputHelper.WriteLine($"Decimal128Builder.Append: {testCount} 
iterations took {stopwatch.ElapsedMilliseconds} elapsed milliseconds");
+
+            stopwatch.Restart();
+            for (int i = 0; i < testCount; i++)
+            {
+                DecimalUtility.GetBytes(testValue, 38, 0, byteWidth, buffer);
+                builder.Append(buffer);
+            }
+            stopwatch.Stop();
+            _outputHelper.WriteLine($"DecimalUtility.GetBytes: {testCount} 
iterations took {stopwatch.ElapsedMilliseconds} elapsed milliseconds");
+

Review Comment:
   nit: remove extra blank line



##########
csharp/test/Drivers/Apache/Hive2/DecimalUtilityTests.cs:
##########
@@ -0,0 +1,173 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+using System;
+using System.Collections.Generic;
+using System.Data.SqlTypes;
+using System.Diagnostics;
+using System.Globalization;
+using Apache.Arrow.Adbc.Drivers.Apache.Hive2;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Apache.Arrow.Adbc.Tests.Drivers.Apache.Hive2
+{
+    /// <summary>
+    /// Class for testing the Decimal Utilities tests.
+    /// </summary>
+    public class DecimalUtilityTests(ITestOutputHelper outputHelper)
+    {
+        private readonly ITestOutputHelper _outputHelper = outputHelper;
+
+        [SkippableTheory]
+        [MemberData(nameof(Decimal128Data))]
+        public void TestCanConvertDecimal(string value, int precision, int 
scale, int byteWidth, byte[] expected, SqlDecimal? expectedDecimal = default)
+        {
+            byte[] actual = new byte[byteWidth];
+            DecimalUtility.GetBytes(value, precision, scale, byteWidth, 
actual);
+            Assert.Equal(expected, actual);
+            Assert.Equal(0, byteWidth % 4);
+            int[] buffer = new int[byteWidth / 4];
+            for (int i = 0; i < buffer.Length; i++)
+            {
+                buffer[i] = BitConverter.ToInt32(actual, i * sizeof(int));
+            }
+            SqlDecimal actualDecimal = GetSqlDecimal128(actual, 0, precision, 
scale);
+            if (expectedDecimal != null) Assert.Equal(expectedDecimal, 
actualDecimal);
+        }
+
+        [Fact(Skip = "Run manually to confirm equivalent performance")]
+        public void TestConvertDecimalPerformance()
+        {
+            Stopwatch stopwatch = new();

Review Comment:
   As an aside, I highly recommend BenchmarkDotNet for any kind of performance 
testing.



##########
csharp/src/Drivers/Apache/Hive2/DecimalUtility.cs:
##########
@@ -0,0 +1,452 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+using System;
+using System.Numerics;
+
+namespace Apache.Arrow.Adbc.Drivers.Apache.Hive2
+{
+    internal static class DecimalUtility
+    {
+        private const char AsciiZero = '0';
+        private const int AsciiDigitMaxIndex = '9' - AsciiZero;
+        private const char AsciiMinus = '-';
+        private const char AsciiPlus = '+';
+        private const char AsciiUpperE = 'E';
+        private const char AsciiLowerE = 'e';
+        private const char AsciiPeriod = '.';
+
+        /// <summary>
+        /// Gets the BigInteger bytes for the given string value.
+        /// </summary>
+        /// <param name="value">The numeric string value to get bytes 
for.</param>
+        /// <param name="precision">The decimal precision for the target 
Decimal[128|256]</param>
+        /// <param name="scale">The decimal scale for the target 
Decimal[128|256]</param>
+        /// <param name="byteWidth">The width in bytes for the target buffer. 
Should match the length of the bytes parameter.</param>
+        /// <param name="bytes">The buffer to place the BigInteger bytes 
into.</param>
+        /// <exception cref="ArgumentOutOfRangeException"></exception>
+        internal static void GetBytes(string value, int precision, int scale, 
int byteWidth, Span<byte> bytes)
+        {
+            if (precision < 1)
+            {
+                throw new ArgumentOutOfRangeException(nameof(precision), 
precision, "precision value must be greater than zero.");
+            }
+            if (scale < 0 || scale >= precision)
+            {
+                throw new ArgumentOutOfRangeException(nameof(scale), scale, 
"scale value must be in the range 0 .. precision.");
+            }
+            if (byteWidth > bytes.Length)
+            {
+                throw new ArgumentOutOfRangeException(nameof(byteWidth), 
byteWidth, $"value for byteWidth {byteWidth} exceeds the the size of bytes.");
+            }
+
+            BigInteger intergerValue = ToBigInteger(value, precision, scale);

Review Comment:
   nit: typo in `intergerValue` appears six times in this file



##########
csharp/test/Drivers/Apache/Spark/NumericValueTests.cs:
##########
@@ -261,7 +262,8 @@ public async Task TestFloatValuesInsertSelectDelete(float 
value)
             string valueString = ConvertFloatToString(value);
             await InsertSingleValueAsync(table.TableName, columnName, 
valueString);
             object doubleValue = (double)value;
-            object floatValue = 
TestEnvironment.GetValueForProtocolVersion(doubleValue, value)!;
+            // Spark over HTTP returns float as double whereas Spark on 
Databricks returns float.

Review Comment:
   Seems weird :/



##########
csharp/src/Drivers/Apache/Hive2/DecimalUtility.cs:
##########
@@ -0,0 +1,452 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+using System;
+using System.Numerics;
+
+namespace Apache.Arrow.Adbc.Drivers.Apache.Hive2
+{
+    internal static class DecimalUtility
+    {
+        private const char AsciiZero = '0';
+        private const int AsciiDigitMaxIndex = '9' - AsciiZero;
+        private const char AsciiMinus = '-';
+        private const char AsciiPlus = '+';
+        private const char AsciiUpperE = 'E';
+        private const char AsciiLowerE = 'e';
+        private const char AsciiPeriod = '.';
+
+        /// <summary>
+        /// Gets the BigInteger bytes for the given string value.
+        /// </summary>
+        /// <param name="value">The numeric string value to get bytes 
for.</param>
+        /// <param name="precision">The decimal precision for the target 
Decimal[128|256]</param>
+        /// <param name="scale">The decimal scale for the target 
Decimal[128|256]</param>
+        /// <param name="byteWidth">The width in bytes for the target buffer. 
Should match the length of the bytes parameter.</param>
+        /// <param name="bytes">The buffer to place the BigInteger bytes 
into.</param>
+        /// <exception cref="ArgumentOutOfRangeException"></exception>
+        internal static void GetBytes(string value, int precision, int scale, 
int byteWidth, Span<byte> bytes)
+        {
+            if (precision < 1)
+            {
+                throw new ArgumentOutOfRangeException(nameof(precision), 
precision, "precision value must be greater than zero.");
+            }
+            if (scale < 0 || scale >= precision)
+            {
+                throw new ArgumentOutOfRangeException(nameof(scale), scale, 
"scale value must be in the range 0 .. precision.");
+            }
+            if (byteWidth > bytes.Length)
+            {
+                throw new ArgumentOutOfRangeException(nameof(byteWidth), 
byteWidth, $"value for byteWidth {byteWidth} exceeds the the size of bytes.");
+            }
+
+            BigInteger intergerValue = ToBigInteger(value, precision, scale);
+
+            FillBytes(bytes, intergerValue, byteWidth);
+        }
+
+        private static void FillBytes(Span<byte> bytes, BigInteger 
integerValue, int byteWidth)
+        {
+            int bytesWritten = 0;
+#if NETCOREAPP
+            if (!integerValue.TryWriteBytes(bytes, out bytesWritten, false, 
!BitConverter.IsLittleEndian))
+            {
+                throw new OverflowException("Could not extract bytes from 
integer value " + integerValue);
+            }
+#else
+            byte[] tempBytes = integerValue.ToByteArray();
+            bytesWritten = tempBytes.Length;
+            if (bytesWritten > bytes.Length)
+            {
+                throw new OverflowException($"Decimal size greater than 
{byteWidth} bytes: {bytesWritten}");
+            }
+            tempBytes.CopyTo(bytes);
+#endif
+            byte fillByte = (byte)(integerValue < 0 ? 255 : 0);
+            for (int i = bytesWritten; i < byteWidth; i++)
+            {
+                bytes[i] = fillByte;
+            }
+        }
+
+        private static BigInteger ToBigInteger(string value, int precision, 
int scale)
+        {
+            BigInteger intergerValue;
+#if NETCOREAPP
+            ReadOnlySpan<char> significantValue = GetSignificantValue(value, 
precision, scale);
+            intergerValue = BigInteger.Parse(significantValue);
+#else
+            ReadOnlySpan<char> significantValue = 
GetSignificantValue(value.AsSpan(), precision, scale);
+            intergerValue = BigInteger.Parse(significantValue.ToString());
+#endif
+            return intergerValue;
+        }
+
+        private static ReadOnlySpan<char> 
GetSignificantValue(ReadOnlySpan<char> value, int precision, int scale)
+        {
+            ParseDecimal(value, out ParserState state);
+
+            ProcessDecimal(value,
+                precision,
+                scale,
+                state,
+                out char sign,
+                out ReadOnlySpan<char> integerSpan,
+                out ReadOnlySpan<char> fractionalSpan,
+                out int neededScale);
+
+            Span<char> significant = new char[precision + 1];
+            BuildSignificantValue(
+                sign,
+                scale,
+                integerSpan,
+                fractionalSpan,
+                neededScale,
+                significant);
+
+            return significant;
+        }
+
+        private static void ProcessDecimal(ReadOnlySpan<char> value, int 
precision, int scale, ParserState state, out char sign, out ReadOnlySpan<char> 
integerSpan, out ReadOnlySpan<char> fractionalSpan, out int neededScale)
+        {
+            int int_length = 0;
+            int frac_length = 0;
+            int exponent = 0;
+
+            if (state.IntegerStart != -1 && state.IntegerEnd != -1) int_length 
= state.IntegerEnd - state.IntegerStart + 1;
+            if (state.FractionalStart != -1 && state.FractionalEnd != -1) 
frac_length = state.FractionalEnd - state.FractionalStart + 1;
+            if (state.ExponentIndex != -1 && state.ExponentStart != -1 && 
state.ExponentEnd != -1 && state.ExponentEnd >= state.ExponentStart)
+            {
+                int expStart = state.ExpSignIndex != -1 ? state.ExpSignIndex : 
state.ExponentStart;
+                int expLength = state.ExponentEnd - expStart + 1;
+                ReadOnlySpan<char> exponentSpan = value.Slice(expStart, 
expLength);
+#if NETCOREAPP
+                exponent = int.Parse(exponentSpan);
+#else
+                exponent = int.Parse(exponentSpan.ToString());

Review Comment:
   I was thinking it was super-annoying that there are no Parse methods for 
`int` that take `ReadOnlySpan` in net472. Then I remembered that there are some 
Parse methods in System.Memory. Unfortunately, they're all for UTF-8. But then 
I remembered that these strings are actually stored as UTF-8 anyway and we have 
to pay to convert them to .NET `string`. If we instead referenced the 
underlying string data as UTF-8 `ReadOnlySpan<byte>`s, we could use the 
`TryParse` methods on `System.Buffers.Text.Utf8Parser`, which include `int`, 
`long` and `decimal` support and for which the `decimal` implementation does 
allow exponents.
   
   (Could be done as a followup.)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat(csharp/src/Drivers/Apache/Spark): Perform scalar data type conversion for Spark over HTTP [arrow-adbc]

Reply via email to