caldempsey commented on issue #30:
URL: https://github.com/apache/arrow-go/issues/30#issuecomment-3071145315
@loicalleyne nice work, but even though it gets the schema looking right, it fails the following test, where the data is written back out using the combination of bodkin and arrow:
```
package main

import (
    "bytes"
    "fmt"
    "log"
    "os"

    "github.com/apache/arrow-go/v18/arrow"
    "github.com/apache/arrow-go/v18/arrow/array"
    "github.com/apache/arrow-go/v18/arrow/memory"
    "github.com/apache/arrow-go/v18/parquet/pqarrow"
    "github.com/loicalleyne/bodkin"
)

func main() {
    // Complex nested JSON data (NDJSON format)
    // Note: Changed phone country to string format to avoid type issues
    jsonData := `{"id": 1, "user": {"name": "Alice Johnson", "age": 30, "contact": {"email": "[email protected]", "phone": {"country": "+1", "number": "555-0123"}, "address": {"street": "123 Main St", "city": "NYC", "state": "NY", "coordinates": {"latitude": 40.7128, "longitude": -74.0060}}}}, "employment": {"company": {"name": "TechCorp", "industry": "Software", "location": {"building": "Tower A", "floor": 15}}, "position": {"title": "Senior Engineer", "department": "Backend", "level": 4}, "compensation": {"base": 150000, "bonus": {"target": 20000, "multiplier": 1.5}, "equity": {"vested": 5000, "unvested": 15000}}}, "projects": [{"id": "proj-001", "name": "API Gateway", "status": "completed", "metrics": {"linesOfCode": 25000, "coverage": 87.5}}, {"id": "proj-002", "name": "Data Pipeline", "status": "in-progress", "metrics": {"linesOfCode": 12000, "coverage": 92.1}}]}
{"id": 2, "user": {"name": "Bob Smith", "age": 28, "contact": {"email": "[email protected]", "phone": {"country": "+1", "number": "555-0456"}, "address": {"street": "456 Oak Ave", "city": "LA", "state": "CA", "coordinates": {"latitude": 34.0522, "longitude": -118.2437}}}}, "employment": {"company": {"name": "DataFlow", "industry": "Analytics", "location": {"building": "Campus B", "floor": 3}}, "position": {"title": "Data Scientist", "department": "ML", "level": 3}, "compensation": {"base": 130000, "bonus": {"target": 15000, "multiplier": 1.2}, "equity": {"vested": 2000, "unvested": 8000}}}, "projects": [{"id": "proj-003", "name": "ML Model v2", "status": "completed", "metrics": {"linesOfCode": 8000, "coverage": 94.2}}]}
{"id": 3, "user": {"name": "Charlie Davis", "age": 35, "contact": {"email": "[email protected]", "phone": {"country": "+44", "number": "7700-900123"}, "address": {"street": "789 High St", "city": "London", "state": "UK", "coordinates": {"latitude": 51.5074, "longitude": -0.1278}}}}, "employment": {"company": {"name": "GlobalTech", "industry": "Consulting", "location": {"building": "HQ", "floor": 22}}, "position": {"title": "Tech Lead", "department": "Architecture", "level": 5}, "compensation": {"base": 180000, "bonus": {"target": 30000, "multiplier": 2.0}, "equity": {"vested": 10000, "unvested": 20000}}}, "projects": [{"id": "proj-004", "name": "Cloud Migration", "status": "planning", "metrics": {"linesOfCode": 0, "coverage": 0}}, {"id": "proj-005", "name": "Security Audit", "status": "completed", "metrics": {"linesOfCode": 5000, "coverage": 99.1}}, {"id": "proj-006", "name": "Performance Optimization", "status": "in-progress", "metrics": {"linesOfCode": 3500, "coverage": 88.7}}]}`

    pool := memory.NewGoAllocator()

    // Create bodkin instance with options
    u := bodkin.NewBodkin(
        bodkin.WithInferTimeUnits(),
        bodkin.WithTypeConversion(),
        bodkin.WithQuotedValuesAreStrings(), // This helps with "+1" being treated as string
    )

    // Unify to infer schema
    err := u.Unify(jsonData)
    if err != nil {
        log.Fatal("Failed to unify:", err)
    }
    schema, err := u.Schema()
    if err != nil {
        log.Fatal("Failed to get schema:", err)
    }

    // Print schema summary
    fmt.Println("=== Inferred Schema Summary ===")
    for i, field := range schema.Fields() {
        fmt.Printf("%d: %s (%s)\n", i, field.Name, field.Type)
    }
    fmt.Println()

    // Create JSON reader
    reader := array.NewJSONReader(bytes.NewReader([]byte(jsonData)), schema)
    defer reader.Release()

    // Read all records
    var records []arrow.Record
    for reader.Next() {
        rec := reader.Record()
        rec.Retain()
        records = append(records, rec)
    }
    if err := reader.Err(); err != nil {
        log.Fatal("JSON reader error:", err)
    }
    fmt.Printf("Read %d records successfully\n", len(records))

    // Method 1: Write individual records with added column
    fmt.Println("\n=== Method 1: Individual Records with Score ===")
    for i, rec := range records {
        // Add a score column to each record
        newRec := addScoreColumn(rec, schema, float64(90+i*5), pool)
        writeToParquet(newRec, newRec.Schema(), fmt.Sprintf("output_individual_%d.parquet", i+1))
        newRec.Release()
    }

    // Clean up
    for _, rec := range records {
        rec.Release()
    }
}

func addScoreColumn(rec arrow.Record, schema *arrow.Schema, score float64, pool memory.Allocator) arrow.Record {
    // Create score array with same number of rows as the record
    scoreBuilder := array.NewFloat64Builder(pool)
    defer scoreBuilder.Release()
    for i := 0; i < int(rec.NumRows()); i++ {
        scoreBuilder.Append(score)
    }
    scoreArray := scoreBuilder.NewArray()
    defer scoreArray.Release()

    // Create new schema with score field
    newFields := make([]arrow.Field, len(schema.Fields())+1)
    copy(newFields, schema.Fields())
    newFields[len(schema.Fields())] = arrow.Field{
        Name: "performance_score",
        Type: arrow.PrimitiveTypes.Float64,
    }
    newSchema := arrow.NewSchema(newFields, nil)

    // Create new columns array
    newColumns := make([]arrow.Array, rec.NumCols()+1)
    for i := 0; i < int(rec.NumCols()); i++ {
        newColumns[i] = rec.Column(i)
    }
    newColumns[rec.NumCols()] = scoreArray

    return array.NewRecord(newSchema, newColumns, rec.NumRows())
}

func writeToParquet(record arrow.Record, schema *arrow.Schema, filename string) {
    file, err := os.Create(filename)
    if err != nil {
        log.Fatal("Failed to create file:", err)
    }
    defer file.Close()

    writer, err := pqarrow.NewFileWriter(schema, file,
        nil, // use default parquet properties
        pqarrow.NewArrowWriterProperties())
    if err != nil {
        log.Fatal("Failed to create parquet writer:", err)
    }
    defer writer.Close()

    err = writer.Write(record)
    if err != nil {
        log.Fatal("Failed to write parquet:", err)
    }

    fmt.Printf("Created %s (%d rows, %d columns)\n", filename, record.NumRows(), record.NumCols())
}
```
As you can see, the email is botched; I'm not sure if this is a bodkin issue or an arrow issue. Unfortunately this is a bit out of scope for me too, so I'll look beyond arrow for a solution to my needs here (going from JSON to the Arrow format when the JSON is unstructured data). Not sure if this is something you'll want to look into, but it's something to try.
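
In case it helps with triage, here is a minimal sketch of my own (not part of the repro above) that reads back one of the files the snippet writes, output_individual_1.parquet, and prints every column so the mangled email values are visible. It assumes the parquet/file and pqarrow reader APIs in arrow-go v18:

```
package main

import (
    "context"
    "fmt"
    "log"
    "os"

    "github.com/apache/arrow-go/v18/arrow/memory"
    "github.com/apache/arrow-go/v18/parquet/file"
    "github.com/apache/arrow-go/v18/parquet/pqarrow"
)

func main() {
    // Open one of the files produced by the repro above.
    f, err := os.Open("output_individual_1.parquet")
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    // Low-level parquet reader, wrapped in an Arrow file reader.
    pf, err := file.NewParquetReader(f)
    if err != nil {
        log.Fatal(err)
    }
    defer pf.Close()

    rdr, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{}, memory.DefaultAllocator)
    if err != nil {
        log.Fatal(err)
    }

    // Read everything back into an Arrow table and dump each column,
    // including the nested user struct that contains contact.email.
    tbl, err := rdr.ReadTable(context.Background())
    if err != nil {
        log.Fatal(err)
    }
    defer tbl.Release()

    for i := 0; i < int(tbl.NumCols()); i++ {
        col := tbl.Column(i)
        fmt.Printf("column %q:\n", col.Name())
        for _, chunk := range col.Data().Chunks() {
            fmt.Println(chunk)
        }
    }
}
```

If the values already look wrong when you print rec straight off the JSON reader, before writeToParquet is ever called, that would point at the JSON/schema side rather than the parquet writer.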