caldempsey opened a new issue, #448:
URL: https://github.com/apache/arrow-go/issues/448

   ### Describe the bug, including details regarding any error messages, version, and platform.
   
   `array.RecordFromJSON` is drastically slower than `array.NewJSONReader` when processing JSON arrays, and the gap grows superlinearly with input size. In the benchmarks below, a ~29 MB JSON array of 10,000 records takes RecordFromJSON over nine minutes to process, while NewJSONReader completes in under 200 ms.
   
   This makes RecordFromJSON effectively unusable for production workloads involving JSON arrays larger than a few MB. Users are forced to fall back on the more complex NewJSONReader API, with its manual lifecycle management, to get acceptable performance. I ran into this before as a contributor to the Spark Connect Go project, where I found the workaround nestled deep in its test suite.
   
   The issue appears to lie in how RecordFromJSON processes large arrays internally: possibly excessive allocations, inefficient buffering, or a missing fast path for array inputs.
   
   Whatever the cause, it is not clear why the utility exists in this state rather than simply proxying to `NewJSONReader`; frankly, it makes the library look bad. My recommendation would be to either deprecate the method and point users to NewJSONReader, or reimplement it on top of NewJSONReader, because it is one of the first things that crops up in the IDE when searching for a way to get from JSON to an Arrow Record.
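
   For illustration, here is a rough sketch of what such a proxy could look like. This is a hypothetical, untested implementation (the function name is mine, and it needs `io` in addition to the imports used in the benchmark below); it reuses the same array-to-NDJSON conversion as the benchmark and reads the whole input as a single chunk:

   ```go
   // recordFromJSONViaReader is a hypothetical sketch of RecordFromJSON
   // delegating to NewJSONReader. It assumes the input is a JSON array
   // small enough to buffer fully in memory.
   func recordFromJSONViaReader(mem memory.Allocator, schema *arrow.Schema, r io.Reader) (arrow.Record, error) {
        raw, err := io.ReadAll(r)
        if err != nil {
                return nil, err
        }

        // Split the top-level JSON array into newline-delimited JSON,
        // which is the format NewJSONReader consumes.
        var elems []json.RawMessage
        if err := json.Unmarshal(raw, &elems); err != nil {
                return nil, err
        }
        var buf bytes.Buffer
        for _, e := range elems {
                buf.Write(e)
                buf.WriteByte('\n')
        }

        rdr := array.NewJSONReader(&buf, schema,
                array.WithAllocator(mem),
                array.WithChunk(-1)) // one record for the entire input
        defer rdr.Release()

        if !rdr.Next() {
                return nil, rdr.Err()
        }
        rec := rdr.Record()
        rec.Retain() // keep the record alive after the reader is released
        return rec, nil
   }
   ```

   A real implementation would presumably stream array elements rather than buffer everything, but even this naive version should perform in line with the NewJSONReader numbers below.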
   
   ## Benchmark Results
   
   ```sh
   Benchmarking JSON Array Processing
   ==================================
   
   Test size: 1000 records (~2.94 MB)
   RecordFromJSON:           5.389702167s (rows: 1000)
   JSONReader (chunked):      15.466959ms (rows: 1000)
   JSONReader (single chunk):   13.73025ms (rows: 1000)
   
   Speedup vs RecordFromJSON:
     JSONReader (chunked):      348.47x
     JSONReader (single chunk): 392.54x
   
   Test size: 5000 records (~14.73 MB)
   RecordFromJSON:           2m24.649038958s (rows: 5000)
   JSONReader (chunked):      82.727792ms (rows: 5000)
   JSONReader (single chunk):  68.501333ms (rows: 5000)
   
   Speedup vs RecordFromJSON:
     JSONReader (chunked):      1748.49x
     JSONReader (single chunk): 2111.62x
   
   Test size: 10000 records (~29.46 MB)
   RecordFromJSON:           9m32.107715042s (rows: 10000)
   JSONReader (chunked):     192.408416ms (rows: 10000)
   JSONReader (single chunk):    137.111ms (rows: 10000)
   
   Speedup vs RecordFromJSON:
     JSONReader (chunked):      2973.40x
     JSONReader (single chunk): 4172.59x
   ```
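
   A quick sanity check on the scaling: doubling the input from 5,000 to 10,000 records takes RecordFromJSON from ~144.6s to ~572.1s (about 4x), and the 5x step from 1,000 to 5,000 records takes it from ~5.39s to ~144.6s (about 27x). That is consistent with quadratic rather than literally exponential behavior, while the JSONReader timings grow roughly linearly over the same range.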
   
   ## Benchmark Code
   
   ```go
   package main
   
   import (
        "bytes"
        "encoding/json"
        "fmt"
        "time"
   
        "github.com/apache/arrow/go/v18/arrow"
        "github.com/apache/arrow/go/v18/arrow/array"
        "github.com/apache/arrow/go/v18/arrow/memory"
   )
   
   func generateJSONArray(numRecords int) []byte {
        records := make([]map[string]interface{}, numRecords)
        for i := 0; i < numRecords; i++ {
                records[i] = map[string]interface{}{
                        "id":       i,
                        "name":     fmt.Sprintf("record_%d", i),
                        "value":    float64(i) * 1.5,
                        "active":   i%2 == 0,
                        "metadata": fmt.Sprintf("metadata_%d_%s", i, 
string(make([]byte, 500))),
                }
        }
   
        data, _ := json.Marshal(records)
        return data
   }
   
   func benchmarkRecordFromJSON(data []byte, schema *arrow.Schema) (time.Duration, int64) {
        pool := memory.NewGoAllocator()
   
        start := time.Now()
        record, _, err := array.RecordFromJSON(pool, schema, bytes.NewReader(data))
        duration := time.Since(start)
   
        if err != nil {
                panic(err)
        }
   
        numRows := record.NumRows()
        record.Release()
   
        return duration, numRows
   }
   
   // Convert JSON array to NDJSON format for JSONReader
   func jsonArrayToNDJSON(data []byte) ([]byte, error) {
        var records []json.RawMessage
        if err := json.Unmarshal(data, &records); err != nil {
                return nil, err
        }
   
        var buf bytes.Buffer
        for _, record := range records {
                buf.Write(record)
                buf.WriteByte('\n')
        }
        return buf.Bytes(), nil
   }
   
   func benchmarkJSONReader(data []byte, schema *arrow.Schema) (time.Duration, int64) {
        pool := memory.NewGoAllocator()
   
        // Convert JSON array to NDJSON
        ndjsonData, err := jsonArrayToNDJSON(data)
        if err != nil {
                panic(err)
        }
   
        start := time.Now()
   
        rdr := array.NewJSONReader(bytes.NewReader(ndjsonData), schema,
                array.WithAllocator(pool))
        defer rdr.Release()
   
        var totalRows int64
        for rdr.Next() {
                rec := rdr.Record()
                totalRows += rec.NumRows()
        }
   
        if err := rdr.Err(); err != nil {
                panic(err)
        }
   
        duration := time.Since(start)
        return duration, totalRows
   }
   
   // Alternative approach using a single large chunk
   func benchmarkJSONReaderSingleChunk(data []byte, schema *arrow.Schema) (time.Duration, int64) {
        pool := memory.NewGoAllocator()
   
        // Convert JSON array to NDJSON
        ndjsonData, err := jsonArrayToNDJSON(data)
        if err != nil {
                panic(err)
        }
   
        start := time.Now()
   
        rdr := array.NewJSONReader(bytes.NewReader(ndjsonData), schema,
                array.WithAllocator(pool),
                array.WithChunk(-1)) // Read all at once
        defer rdr.Release()
   
        if !rdr.Next() {
                panic("no record found")
        }
   
        rec := rdr.Record()
        numRows := rec.NumRows()
   
        duration := time.Since(start)
        return duration, numRows
   }
   
   func main() {
        schema := arrow.NewSchema([]arrow.Field{
                {Name: "id", Type: arrow.PrimitiveTypes.Int64},
                {Name: "name", Type: arrow.BinaryTypes.String},
                {Name: "value", Type: arrow.PrimitiveTypes.Float64},
                {Name: "active", Type: arrow.FixedWidthTypes.Boolean},
                {Name: "metadata", Type: arrow.BinaryTypes.String},
        }, nil)
   
        testSizes := []int{1000, 5000, 10000}
   
        fmt.Println("Benchmarking JSON Array Processing")
        fmt.Println("==================================")
   
        for _, size := range testSizes {
                data := generateJSONArray(size)
                dataSizeMB := float64(len(data)) / (1024 * 1024)
                fmt.Printf("\nTest size: %d records (~%.2f MB)\n", size, 
dataSizeMB)
   
                // Benchmark RecordFromJSON
                duration1, rows1 := benchmarkRecordFromJSON(data, schema)
                fmt.Printf("RecordFromJSON:           %12v (rows: %d)\n", 
duration1, rows1)
   
                // Benchmark JSONReader (default chunking)
                duration2, rows2 := benchmarkJSONReader(data, schema)
                fmt.Printf("JSONReader (chunked):     %12v (rows: %d)\n", 
duration2, rows2)
   
                // Benchmark JSONReader (single chunk)
                duration3, rows3 := benchmarkJSONReaderSingleChunk(data, schema)
                fmt.Printf("JSONReader (single chunk): %12v (rows: %d)\n", 
duration3, rows3)
   
                fmt.Printf("\nSpeedup vs RecordFromJSON:\n")
                fmt.Printf("  JSONReader (chunked):      %.2fx\n", 
float64(duration1)/float64(duration2))
                fmt.Printf("  JSONReader (single chunk): %.2fx\n", 
float64(duration1)/float64(duration3))
        }
   }
   ```
   
   Environment: Apache Arrow Go v18, Go 1.22
   
   **Note**: The performance difference becomes more pronounced with larger 
datasets. For production workloads processing 50K+ records, RecordFromJSON 
becomes effectively unusable.
   
   ### Component(s)
   
   Benchmarking

