This is an automated email from the ASF dual-hosted git repository.

curth pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new fe38d47e82 GH-41047: [C#] Address performance issue of reading from 
StringArray (#41048)
fe38d47e82 is described below

commit fe38d47e82c6ffc231f0918be15aa5cd0da98607
Author: keshen-msft <[email protected]>
AuthorDate: Mon Apr 8 13:14:31 2024 -0700

    GH-41047: [C#] Address performance issue of reading from StringArray 
(#41048)
    
    
    
    ### Rationale for this change
    
    The motivation here is to address 
https://github.com/apache/arrow/issues/41047. There is a severe performance 
drawback in reading a StringArray as the value array of a DictionaryArray, because 
of repeated and unnecessary UTF-8 string decoding.
    
    ### What changes are included in this PR?
    
    - Added a new function Materialize() to decode the values once into a cached string array (one per encoding). 
When materialized, GetString() reads from that cached array directly.
    - Added test coverage.
    
    ### Are these changes tested?
    
    Yes
    
    ### Are there any user-facing changes?
    
    No. This change maintains backwards compatibility on the API surface. It is 
up to the client application to decide whether to materialize the array and 
gain performance.
    
    * GitHub Issue: #41047
    
    Authored-by: Keshuang Shen <[email protected]>
    Signed-off-by: Curt Hagenlocher <[email protected]>
---
 csharp/src/Apache.Arrow/Arrays/StringArray.cs      | 60 +++++++++++++++++++++-
 csharp/test/Apache.Arrow.Tests/StringArrayTests.cs | 31 +++++++++++
 2 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/csharp/src/Apache.Arrow/Arrays/StringArray.cs 
b/csharp/src/Apache.Arrow/Arrays/StringArray.cs
index af77fe1b1a..a3ec596adc 100644
--- a/csharp/src/Apache.Arrow/Arrays/StringArray.cs
+++ b/csharp/src/Apache.Arrow/Arrays/StringArray.cs
@@ -13,12 +13,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-using Apache.Arrow.Types;
 using System;
 using System.Collections;
 using System.Collections.Generic;
 using System.Runtime.InteropServices;
 using System.Text;
+using Apache.Arrow.Types;
 
 namespace Apache.Arrow
 {
@@ -26,6 +26,8 @@ namespace Apache.Arrow
     {
         public static readonly Encoding DefaultEncoding = Encoding.UTF8;
 
+        private Dictionary<Encoding, string[]> materializedStringStore;
+
         public new class Builder : BuilderBase<StringArray, Builder>
         {
             public Builder() : base(StringType.Default) { }
@@ -71,16 +73,28 @@ namespace Apache.Arrow
 
         public override void Accept(IArrowArrayVisitor visitor) => 
Accept(this, visitor);
 
+        /// <summary>
+        /// Get the string value at the given index
+        /// </summary>
+        /// <param name="index">Input index</param>
+        /// <param name="encoding">Optional: the string encoding, default is 
UTF8</param>
+        /// <returns>The string object at the given index</returns>
         public string GetString(int index, Encoding encoding = default)
         {
             encoding ??= DefaultEncoding;
 
+            if (materializedStringStore != null && 
materializedStringStore.TryGetValue(encoding, out string[] materializedStrings))
+            {
+                return materializedStrings[index];
+            }
+
             ReadOnlySpan<byte> bytes = GetBytes(index, out bool isNull);
 
             if (isNull)
             {
                 return null;
             }
+
             if (bytes.Length == 0)
             {
                 return string.Empty;
@@ -93,6 +107,50 @@ namespace Apache.Arrow
             }
         }
 
+        /// <summary>
+        /// Decode and cache every value for the given encoding to accelerate subsequent 
string access
+        /// </summary>
+        /// <param name="encoding">Optional: the string encoding, default is 
UTF8</param>
+        /// <remarks>This method is not thread safe when it is called in 
parallel with <see cref="GetString(int, Encoding)"/> or <see 
cref="Materialize(Encoding)"/>.</remarks>
+        public void Materialize(Encoding encoding = default)
+        {
+            encoding ??= DefaultEncoding;
+
+            if (IsMaterialized(encoding))
+            {
+                return; // already cached for this encoding; nothing to do
+            }
+
+            if (materializedStringStore == null)
+            {
+                materializedStringStore = new Dictionary<Encoding, string[]>(); // created lazily on first use
+            }
+
+            var stringStore = new string[Length];
+            for (int i = 0; i < Length; i++)
+            {
+                stringStore[i] = GetString(i, encoding); // no cache entry for this encoding yet, so GetString takes its non-cached path
+            }
+
+            materializedStringStore[encoding] = stringStore; // stored only after every slot has been filled
+        }
+
+        /// <summary>
+        /// Check if the array has been materialized for the given encoding
+        /// </summary>
+        /// <param name="encoding">Optional: the string encoding, default is 
UTF8</param>
+        /// <returns>True if strings for the encoding are cached, false if the array has not been 
materialized</returns>
+        public bool IsMaterialized(Encoding encoding = default)
+        {
+            if (materializedStringStore == null)
+            {
+                return false; // no cache dictionary exists yet, so no encoding can be cached
+            }
+
+            encoding ??= DefaultEncoding;
+            return materializedStringStore.ContainsKey(encoding);
+        }
+
         int IReadOnlyCollection<string>.Count => Length;
 
         string IReadOnlyList<string>.this[int index] => GetString(index);
diff --git a/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs 
b/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs
index 0fd3d3d105..b19731535a 100644
--- a/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs
+++ b/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs
@@ -49,6 +49,37 @@ namespace Apache.Arrow.Tests
                 // Assert
                 Assert.Equal(firstValue, retrievedValue);
             }
+
+            [Theory]
+            [InlineData(null, null)]
+            [InlineData(null, "")]
+            [InlineData(null, "value")]
+            [InlineData("", null)]
+            [InlineData("", "")]
+            [InlineData("", "value")]
+            [InlineData("value", null)]
+            [InlineData("value", "")]
+            [InlineData("value", "value")]
+            public void ReturnsAppendedValueMaterialize(string firstValue, 
string secondValue)
+            {
+                // Arrange
+                // Create an array with two elements. The second element being 
null,
+                // empty, or non-empty may influence the underlying BinaryArray
+                // storage such that retrieving an empty first element could 
result
+                // in an empty span or a 0-length span backed by storage.
+                var array = new StringArray.Builder()
+                    .Append(firstValue)
+                    .Append(secondValue)
+                    .Build();
+
+                // Act: materialize first so that GetString(0) is answered from the cached strings
+                array.Materialize();
+                var retrievedValue = array.GetString(0);
+
+                // Assert: the default-encoding cache exists and yields exactly what was appended
+                Assert.True(array.IsMaterialized());
+                Assert.Equal(firstValue, retrievedValue);
+            }
         }
     }
 }

Reply via email to