This is an automated email from the ASF dual-hosted git repository.
curth pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new fe38d47e82 GH-41047: [C#] Address performance issue of reading from
StringArray (#41048)
fe38d47e82 is described below
commit fe38d47e82c6ffc231f0918be15aa5cd0da98607
Author: keshen-msft <[email protected]>
AuthorDate: Mon Apr 8 13:14:31 2024 -0700
GH-41047: [C#] Address performance issue of reading from StringArray
(#41048)
### Rationale for this change
The motivation here is to address
https://github.com/apache/arrow/issues/41047. There is a severe performance
drawback in reading a StringArray as the value array of a DictionaryArray,
because of repeated and unnecessary UTF-8 string decoding.
### What changes are included in this PR?
- Added a new method Materialize() that decodes every value once and caches
the results in a string array (one cache per encoding). Once the array is
materialized, GetString() returns the cached string instead of decoding again.
- Added test coverage.
### Are these changes tested?
Yes
### Are there any user-facing changes?
No. This change maintains backwards compatibility on the API surface. It is
up to the client application to decide whether to materialize the array and
gain performance.
* GitHub Issue: #41047
Authored-by: Keshuang Shen <[email protected]>
Signed-off-by: Curt Hagenlocher <[email protected]>
---
csharp/src/Apache.Arrow/Arrays/StringArray.cs | 60 +++++++++++++++++++++-
csharp/test/Apache.Arrow.Tests/StringArrayTests.cs | 31 +++++++++++
2 files changed, 90 insertions(+), 1 deletion(-)
diff --git a/csharp/src/Apache.Arrow/Arrays/StringArray.cs
b/csharp/src/Apache.Arrow/Arrays/StringArray.cs
index af77fe1b1a..a3ec596adc 100644
--- a/csharp/src/Apache.Arrow/Arrays/StringArray.cs
+++ b/csharp/src/Apache.Arrow/Arrays/StringArray.cs
@@ -13,12 +13,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-using Apache.Arrow.Types;
using System;
using System.Collections;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;
+using Apache.Arrow.Types;
namespace Apache.Arrow
{
@@ -26,6 +26,8 @@ namespace Apache.Arrow
{
public static readonly Encoding DefaultEncoding = Encoding.UTF8;
+ private Dictionary<Encoding, string[]> materializedStringStore;
+
public new class Builder : BuilderBase<StringArray, Builder>
{
public Builder() : base(StringType.Default) { }
@@ -71,16 +73,28 @@ namespace Apache.Arrow
public override void Accept(IArrowArrayVisitor visitor) =>
Accept(this, visitor);
+ /// <summary>
+ /// Get the string value at the given index
+ /// </summary>
+ /// <param name="index">Input index</param>
+ /// <param name="encoding">Optional: the string encoding, default is
UTF8</param>
+ /// <returns>The string object at the given index</returns>
public string GetString(int index, Encoding encoding = default)
{
encoding ??= DefaultEncoding;
+ if (materializedStringStore != null &&
materializedStringStore.TryGetValue(encoding, out string[] materializedStrings))
+ {
+ return materializedStrings[index];
+ }
+
ReadOnlySpan<byte> bytes = GetBytes(index, out bool isNull);
if (isNull)
{
return null;
}
+
if (bytes.Length == 0)
{
return string.Empty;
@@ -93,6 +107,50 @@ namespace Apache.Arrow
}
}
+ /// <summary>
+ /// Materialize the array for the given encoding to accelerate the
string access
+ /// </summary>
+ /// <param name="encoding">Optional: the string encoding, default is
UTF8</param>
+ /// <remarks>This method is not thread safe when it is called in
parallel with <see cref="GetString(int, Encoding)"/> or <see
cref="Materialize(Encoding)"/>.</remarks>
+ public void Materialize(Encoding encoding = default)
+ {
+ encoding ??= DefaultEncoding;
+
+ if (IsMaterialized(encoding))
+ {
+ return;
+ }
+
+ if (materializedStringStore == null)
+ {
+ materializedStringStore = new Dictionary<Encoding, string[]>();
+ }
+
+ var stringStore = new string[Length];
+ for (int i = 0; i < Length; i++)
+ {
+ stringStore[i] = GetString(i, encoding);
+ }
+
+ materializedStringStore[encoding] = stringStore;
+ }
+
+ /// <summary>
+ /// Check if the array has been materialized for the given encoding
+ /// </summary>
+ /// <param name="encoding">Optional: the string encoding, default is
UTF8</param>
+ /// <returns>True if the array has been materialized for the given
encoding; otherwise false</returns>
+ public bool IsMaterialized(Encoding encoding = default)
+ {
+ if (materializedStringStore == null)
+ {
+ return false;
+ }
+
+ encoding ??= DefaultEncoding;
+ return materializedStringStore.ContainsKey(encoding);
+ }
+
int IReadOnlyCollection<string>.Count => Length;
string IReadOnlyList<string>.this[int index] => GetString(index);
diff --git a/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs
b/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs
index 0fd3d3d105..b19731535a 100644
--- a/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs
+++ b/csharp/test/Apache.Arrow.Tests/StringArrayTests.cs
@@ -49,6 +49,37 @@ namespace Apache.Arrow.Tests
// Assert
Assert.Equal(firstValue, retrievedValue);
}
+
+ [Theory]
+ [InlineData(null, null)]
+ [InlineData(null, "")]
+ [InlineData(null, "value")]
+ [InlineData("", null)]
+ [InlineData("", "")]
+ [InlineData("", "value")]
+ [InlineData("value", null)]
+ [InlineData("value", "")]
+ [InlineData("value", "value")]
+ public void ReturnsAppendedValueMaterialize(string firstValue,
string secondValue)
+ {
+ // Arrange
+ // Create an array with two elements. The second element being
null,
+ // empty, or non-empty may influence the underlying BinaryArray
+ // storage such that retrieving an empty first element could
result
+ // in an empty span or a 0-length span backed by storage.
+ var array = new StringArray.Builder()
+ .Append(firstValue)
+ .Append(secondValue)
+ .Build();
+
+ // Act
+ array.Materialize();
+ var retrievedValue = array.GetString(0);
+
+ // Assert
+ Assert.True(array.IsMaterialized());
+ Assert.Equal(firstValue, retrievedValue);
+ }
}
}
}