I am running my spark (1.5.2) instance in a virtualbox VM. I have 10gb
memory allocated to it.
I have a fact table extract, with 10000 rows
var glbalance_df_select = glbalance_df.select
("LEDGER_ID","CODE_COMBINATION_ID","CURRENCY_CODE",
"PERIOD_TYPE","TEMPLATE_ID",
"PERIOD_NAME","ACTUAL_FLAG","BUDGET_VERSION_ID",
"TRANSLATED_FLAG","PERIOD_NET_DR","PERIOD_NET_CR",
"BEGIN_BALANCE_DR","BEGIN_BALANCE_CR")
.filter(
not(glbalance_df("CURRENCY_CODE")=== "STAT")
and
(glbalance_df("TEMPLATE_ID").isNull || glbalance_df("TEMPLATE_ID") ===
"None")
and
(glbalance_df("TRANSLATED_FLAG") === "Y" ||
glbalance_df("TRANSLATED_FLAG").isNull || glbalance_df("TRANSLATED_FLAG")
=== "None" )
and
(glbalance_df("ACTUAL_FLAG") === "A" or glbalance_df("ACTUAL_FLAG") === "B")
)
I am joining the fact table to the first dimension (with 100 rows).
var glbalance_ledger_df = glbalance_df_select.join(ledger_df_select,
glbalance_df_select("LEDGER_ID") <=>
ledger_df_select("LEDGER_ID"),
"inner" )
.drop(ledger_df_select("LEDGER_ID"))
When I save the DataFrame "glbalance_ledger_df" to a textfile , it saves the
data in 1 mins
2nd dimension dataframe
tableName = "w_gl_period_d"
var period_df_select = msc.table(s"$dbName.$tableName")
period_df_select = period_df_select.select("PERIOD_NAME",
"PERIOD_TYPE",
"PERIOD_SET_NAME"
,"START_DATE","END_DATE" ).cache()
Now I join the 2nd dimension DF to the resultant of the fact DF and save the
data, it takes 2hrs.
var glbalance_ledger_period_df = glbalance_ledger_df.join(period_df_select,
glbalance_ledger_df("PERIOD_SET_NAME") <=>
period_df_select("PERIOD_SET_NAME")
&& glbalance_ledger_df("PERIOD_NAME") <=>
period_df_select("PERIOD_NAME")
&& glbalance_ledger_df("PERIOD_TYPE") <=>
period_df_select("PERIOD_TYPE")
,
"inner" )
How do I improve the performance of the join?
Thx,
Subhajit