Pretty straightforward, as pointed out by Ted.

--read csv file into a df
val df = sqlContext.read.format("com.databricks.spark.csv").option("inferSchema",
"true").option("header", "true").load("/data/stg/table2")

scala> df.printSchema
 |-- Invoice Number: string (nullable = true)
 |-- Payment date: string (nullable = true)
 |-- Net: string (nullable = true)
 |-- VAT: string (nullable = true)
 |-- Total: string (nullable = true)
-- rename the first column to InvoiceNumber, getting rid of the space
scala> val df_1 = df.withColumnRenamed("Invoice Number","InvoiceNumber")
df_1: org.apache.spark.sql.DataFrame = [InvoiceNumber: string, Payment
date: string, Net: string, VAT: string, Total: string]
--drop column Total
scala> val df_2 = df_1.drop("Total")
df_2: org.apache.spark.sql.DataFrame = [InvoiceNumber: string, Payment
date: string, Net: string, VAT: string]
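-- Change InvoiceNumber from String to Integer (convertColumn is a user-defined
-- helper, not part of the Spark API; presumably the one given earlier in this thread)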
scala> val df_3 = convertColumn(df_2, "InvoiceNumber","Integer")
df_3: org.apache.spark.sql.DataFrame = [Payment date: string, Net: string,
VAT: string, InvoiceNumber: int]


Reply via email to