This is an automated email from the ASF dual-hosted git repository. raulcd pushed a commit to branch maint-16.0.0 in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 3a4962dd8c71da6bbbfb2467b739c2aed574a624 Author: keshen-msft <[email protected]> AuthorDate: Mon Apr 8 13:14:31 2024 -0700 GH-41047: [C#] Address performance issue of reading from StringArray (#41048) ### Rationale for this change The motivation here is to address https://github.com/apache/arrow/issues/41047. There is a severe performance drawback in reading a StringArray as the value array of a DictionaryArray, because of repeated and unnecessary UTF-8 string decoding. ### What changes are included in this PR? - Added a new function Materialize() to materialize the values into a cached string array. When materialized, GetString() reads from that array directly. - Added test coverage. ### Are these changes tested? Yes ### Are there any user-facing changes? No. This change maintains backwards compatibility on the API surface. It is up to the client application to decide whether to materialize the array and gain performance. * GitHub Issue: #41047 Authored-by: Keshuang Shen <[email protected]> Signed-off-by: Curt Hagenlocher <[email protected]> --- csharp/src/Apache.Arrow/Arrays/StringArray.cs | 60 +++++++++++++++++++++- csharp/test/Apache.Arrow.Tests/StringArrayTests.cs | 31 +++++++++++ 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow/Arrays/StringArray.cs b/csharp/src/Apache.Arrow/Arrays/StringArray.cs index af77fe1b1a..a3ec596adc 100644 --- a/csharp/src/Apache.Arrow/Arrays/StringArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/StringArray.cs @@ -13,12 +13,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-using Apache.Arrow.Types; using System; using System.Collections; using System.Collections.Generic; using System.Runtime.InteropServices; using System.Text; +using Apache.Arrow.Types; namespace Apache.Arrow { @@ -26,6 +26,8 @@ namespace Apache.Arrow { public static readonly Encoding DefaultEncoding = Encoding.UTF8; + private Dictionary<Encoding, string[]> materializedStringStore; + public new class Builder : BuilderBase<StringArray, Builder> { public Builder() : base(StringType.Default) { } @@ -71,16 +73,28 @@ namespace Apache.Arrow public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); + /// <summary> + /// Get the string value at the given index + /// </summary> + /// <param name="index">Input index</param> + /// <param name="encoding">Optional: the string encoding, default is UTF8</param> + /// <returns>The string object at the given index</returns> public string GetString(int index, Encoding encoding = default) { encoding ??= DefaultEncoding; + if (materializedStringStore != null && materializedStringStore.TryGetValue(encoding, out string[] materializedStrings)) + { + return materializedStrings[index]; + } + ReadOnlySpan<byte> bytes = GetBytes(index, out bool isNull); if (isNull) { return null; } + if (bytes.Length == 0) { return string.Empty; @@ -93,6 +107,50 @@ namespace Apache.Arrow } } + /// <summary> + /// Materialize the array for the given encoding to accelerate the string access + /// </summary> + /// <param name="encoding">Optional: the string encoding, default is UTF8</param> + /// <remarks>This method is not thread safe when it is called in parallel with <see cref="GetString(int, Encoding)"/> or <see cref="Materialize(Encoding)"/>.</remarks> + public void Materialize(Encoding encoding = default) + { + encoding ??= DefaultEncoding; + + if (IsMaterialized(encoding)) + { + return; + } + + if (materializedStringStore == null) + { + materializedStringStore = new Dictionary<Encoding, string[]>(); + } + + var stringStore = new 
string[Length]; + for (int i = 0; i < Length; i++) + { + stringStore[i] = GetString(i, encoding); + } + + materializedStringStore[encoding] = stringStore; + } + + /// <summary> + /// Check if the array has been materialized for the given encoding + /// </summary> + /// <param name="encoding">Optional: the string encoding, default is UTF8</param> + /// <returns>True or false whether the array has been materialized</returns> + public bool IsMaterialized(Encoding encoding = default) + { + if (materializedStringStore == null) + { + return false; + } + + encoding ??= DefaultEncoding; + return materializedStringStore.ContainsKey(encoding); + } + int IReadOnlyCollection<string>.Count => Length; string IReadOnlyList<string>.this[int index] => GetString(index); diff --git a/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs b/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs index 0fd3d3d105..b19731535a 100644 --- a/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs @@ -49,6 +49,37 @@ namespace Apache.Arrow.Tests // Assert Assert.Equal(firstValue, retrievedValue); } + + [Theory] + [InlineData(null, null)] + [InlineData(null, "")] + [InlineData(null, "value")] + [InlineData("", null)] + [InlineData("", "")] + [InlineData("", "value")] + [InlineData("value", null)] + [InlineData("value", "")] + [InlineData("value", "value")] + public void ReturnsAppendedValueMaterialize(string firstValue, string secondValue) + { + // Arrange + // Create an array with two elements. The second element being null, + // empty, or non-empty may influence the underlying BinaryArray + // storage such that retrieving an empty first element could result + // in an empty span or a 0-length span backed by storage. 
+ var array = new StringArray.Builder() + .Append(firstValue) + .Append(secondValue) + .Build(); + + // Act + array.Materialize(); + var retrievedValue = array.GetString(0); + + // Assert + Assert.True(array.IsMaterialized()); + Assert.Equal(firstValue, retrievedValue); + } } } }
