[jira] [Comment Edited] (SPARK-32060) Huber loss Convergence

2020-06-30 Thread zhengruifeng (Jira)


[ https://issues.apache.org/jira/browse/SPARK-32060?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17148412#comment-17148412 ]

zhengruifeng edited comment on SPARK-32060 at 6/30/20, 8:20 AM:


I found that the optimization of Huber loss is unstable if the input dataset is shuffled:

spark: 2.4.5

cmd: spark-shell --driver-memory=96G --conf spark.driver.maxResultSize=10g

 
{code:java}
import org.apache.spark.ml.classification._
import org.apache.spark.ml.regression._
import org.apache.spark.sql.functions._
import org.apache.spark.storage.StorageLevel

// epsilon labels are {-1, +1}; remap them to {0, 1}
val df = spark.read
  .option("numFeatures", "2000")
  .format("libsvm")
  .load("/data1/Datasets/epsilon/epsilon_normalized.t")
  .withColumn("label", (col("label") + 1) / 2)
df.persist(StorageLevel.MEMORY_AND_DISK)
df.count

// baseline fit, then 5 refits on randomly shuffled copies of the same data
val svc = new LinearSVC().setMaxIter(100).setTol(0)
val svcmodel = svc.fit(df)
val svcmodels = Seq.range(0, 5).map { seed =>
  val df2 = df.sort(rand(seed)).persist(StorageLevel.MEMORY_AND_DISK)
  df2.count
  val model = svc.fit(df2)
  df2.unpersist()
  model
}

val lir = new LinearRegression().setMaxIter(100).setSolver("l-bfgs").setTol(0)
val lirmodel = lir.fit(df)
val lirmodels = Seq.range(0, 5).map { seed =>
  val df2 = df.sort(rand(seed)).persist(StorageLevel.MEMORY_AND_DISK)
  df2.count
  val model = lir.fit(df2)
  df2.unpersist()
  model
}

val huber = new LinearRegression().setMaxIter(100).setSolver("l-bfgs").setLoss("huber").setTol(0)
val hubermodel = huber.fit(df)
val hubermodels = Seq.range(0, 5).map { seed =>
  val df2 = df.sort(rand(seed)).persist(StorageLevel.MEMORY_AND_DISK)
  df2.count
  val model = huber.fit(df2)
  df2.unpersist()
  model
}
{code}
 

results:
{code:java}
scala> svcmodel.coefficients
res4: org.apache.spark.ml.linalg.Vector = [-2.0998984926889244,-0.3252719796828287,1.6224962635038596,-0.09459144575027117,-0.024713074721534507,-0.04893864248356599,4.521280777017717,-0.3920439314738444,0.027631053567458274,-0.010013241182040592,0.3259469228241217,-1.1125182474604842,-0.17320114704,0.03365461088305983,0.02162518688538647,0.07204641375676599,-0.07429479630422156,0.005457557625321678,-2.9532140605652275,2.7240907567070676,-0.3066203528914533,-0.11475917863808731,-0.20761462370516978,-0.4066885419952761,-0.08185889069309363,-0.6318876493014741,0.06405628348073204,-0.32732378261855793,0.01462176019045602,0.8492238295542848,-0.20854294380974547,0.008039275953692854,0.05597077397428801,-0.06302333216930013,0.005602373131582006,-0.05995911252186677,0.3381639630496303,0.63...

scala> svcmodels.map(_.coefficients.toArray.take(10).mkString(",")).foreach(println)
-2.099898492688956,-0.32527197968284316,1.622496263503862,-0.09459144575027553,-0.02471307472153921,-0.048938642483572034,4.521280777017707,-0.39204393147385175,0.027631053567457448,-0.010013241182041081
-2.0998984926889364,-0.32527197968284405,1.6224962635038631,-0.09459144575027562,-0.024713074721539482,-0.0489386424835727,4.521280777017712,-0.3920439314738487,0.027631053567458107,-0.010013241182040594
-2.0998984926889346,-0.3252719796828256,1.6224962635038638,-0.09459144575027183,-0.02471307472153609,-0.04893864248356505,4.52128077701772,-0.392043931473848,0.027631053567458715,-0.010013241182040738
-2.0998984926889612,-0.3252719796828379,1.6224962635038718,-0.09459144575027294,-0.024713074721542546,-0.04893864248357126,4.521280777017708,-0.3920439314738543,0.027631053567458354,-0.01001324118203996
-2.099898492688976,-0.32527197968284066,1.6224962635038813,-0.09459144575027299,-0.024713074721538494,-0.04893864248357121,4.521280777017711,-0.39204393147382943,0.027631053567457917,-0.01001324118203988



scala> lirmodel.coefficients
res6: org.apache.spark.ml.linalg.Vector = [-0.15694030457077052,-0.03297314855191394,0.21896060695714925,0.043191022987982185,0.029914098626947626,0.037812647639103455,0.3777274539423792,-0.004353078286124242,-0.15575156684399277,-5.036838920393178E-4,0.058833881325688855,-0.08017543724230564,-0.0492390194915689,-0.038321196923869975,-0.130894554829739,-0.042355695456345384,-0.009015454450718942,-0.04370761619435822,-0.29579850779021977,0.004520096449137435,-0.006967975798308324,-0.06845005746315802,0.266918095634905,-0.02940614765439654,-0.026540163642704145,-0.06595165721239701,0.014980713825836588,-0.04093065488345465,0.01959430385382978,0.050848208119031076,-0.05637861639180545,0.14531387392683578,-0.010366403339646989,-0.06789567679629482,-0.01138250848452352,0.012813168874534375,0...

scala> lirmodels.map(_.coefficients.toArray.take(10).mkString(",")).foreach(println)
-0.15782290165350424,-0.032888518926744874,0.21830737648275617,0.042901864085398485,0.030379578212786106,0.03726297774165828,0.37590079059854775,-0.005096797676258801,-0.16294717929387345,-3.4683642628292866E-4
-0.158134002701788,-0.03297488977298756,0.223484223933237,0.04362644996012737,0.02955839240619015,0.037886954492652274,0.3781987525027171,-0.0042601468957566635,-0.15377036026585994,-7.808247198886335E-4
 {code}

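As a rough way to quantify the run-to-run variation, one can print the largest per-coefficient difference between the baseline fit and each shuffled refit. A minimal sketch, assuming the svcmodel/svcmodels, lirmodel/lirmodels and hubermodel/hubermodels values from the snippet above (maxDiff is just a helper defined here, not a Spark API):
{code:java}
import org.apache.spark.ml.linalg.Vector

// largest absolute per-coefficient difference between two fitted models
def maxDiff(a: Vector, b: Vector): Double =
  a.toArray.zip(b.toArray).map { case (x, y) => math.abs(x - y) }.max

svcmodels.foreach { m => println(maxDiff(svcmodel.coefficients, m.coefficients)) }
lirmodels.foreach { m => println(maxDiff(lirmodel.coefficients, m.coefficients)) }
hubermodels.foreach { m => println(maxDiff(hubermodel.coefficients, m.coefficients)) }
{code}
In the output above, the LinearSVC refits agree to roughly 1e-13 per coefficient, while the squared-loss LinearRegression refits already differ around the third decimal place.
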
[jira] [Comment Edited] (SPARK-32060) Huber loss Convergence

2020-06-28 Thread zhengruifeng (Jira)


[ https://issues.apache.org/jira/browse/SPARK-32060?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17147261#comment-17147261 ]

zhengruifeng edited comment on SPARK-32060 at 6/28/20, 8:34 AM:


{code:java}
import org.apache.spark.ml.regression._
import org.apache.spark.storage.StorageLevel

val df = spark.read
  .option("numFeatures", "2000")
  .format("libsvm")
  .load("/data1/Datasets/epsilon/epsilon_normalized.t")
df.persist(StorageLevel.MEMORY_AND_DISK)
df.count

val lir = new LinearRegression().setMaxIter(200).setSolver("l-bfgs").setLoss("huber")

// fit with different block sizes, recording the wall-clock training time of each run
val results = Seq(1, 4, 16, 64, 256, 1024, 4096).map { size =>
  val start = System.currentTimeMillis
  val model = lir.setBlockSize(size).fit(df)
  val end = System.currentTimeMillis
  (size, model, end - start)
}
{code}
 

model coefficients (first 200 characters of each):
{code:java}
scala> results.map(_._2.coefficients).foreach(coef => println(coef.toString.take(200)))
[-0.1609083025667508,-0.1504208122473649,0.7857316265190127,0.1905294278240982,0.48613646504894936,-0.026194861709278365,0.590635887747112,0.03185142111622796,8.347531055523673,0.05032008235983659,0.0
[-0.14168611353422972,-0.09988761525554064,0.5465392380563737,0.1948729061499901,0.4763355879043651,-0.3012279914216939,0.6313906259537879,0.09533675545276975,10.461020810672274,0.15677230833505942,-0
[0.0129107378236514,-0.023733643262643805,0.7206248421409548,0.1281202961920889,0.6331850100541732,-0.07297545577093478,0.7943888663518902,0.1345404102446435,10.426743282094897,0.022989137878464405,0.
[0.030744371107965504,-0.18953315635218193,0.7474602191912736,0.1759290649344934,0.48334851886329333,-0.18612454543317197,0.623576899875435,0.10960148194302292,9.305819813630439,0.07680152463656026,-0
[0.06489015002773292,-0.2013517907421197,0.7090030134636589,0.05515361023479412,0.3904484093136326,0.11987256805921637,0.550217950324033,0.0557189628809737,7.24524505892832,-0.09041629158543917,0.0809
[-0.18300047132898184,-0.21732260127922864,0.8444018472270687,0.10275527109275327,0.07750772677176482,0.2282620884662859,0.5299055708518087,0.07284146396600312,7.7820378386877245,-0.014623101293592242
[-0.09575146808314546,-0.2307269364289983,0.8121553524047764,0.14527766692142594,0.4327749717709629,-0.024082387632074886,0.6239466285761414,0.03986689640912914,7.6761329131634435,-0.0369776197065
 {code}
 

The objectiveHistory of each run is also attached (huber.xlsx).
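For reference, a minimal sketch of how the per-run timing and objectiveHistory can be printed, assuming the results value from the snippet above (summary.objectiveHistory is the objective trajectory kept in each fitted model's training summary):
{code:java}
// print block size, wall-clock training time (ms), and objective trajectory info
results.foreach { case (size, model, time) =>
  val history = model.summary.objectiveHistory
  println(s"blockSize=$size time=${time}ms iterations=${history.length} finalObjective=${history.last}")
}
{code}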

 



> Huber loss Convergence
> --
>
> Key: SPARK-32060
> URL: https://issues.apache.org/jira/browse/SPARK-32060
> Project: Spark
>  Issue Type: Sub-task
>  Components: ML
>Affects Versions: 3.1.0
>Reporter: zhengruifeng
>Priority: Minor
> Attachments: huber.xlsx
>
>
> 
