[ https://issues.apache.org/jira/browse/SPARK-32060?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17147261#comment-17147261 ]
zhengruifeng edited comment on SPARK-32060 at 6/28/20, 8:34 AM: ---------------------------------------------------------------- {code:java} import org.apache.spark.ml.regression._ import org.apache.spark.storage.StorageLevel val df = spark.read.option("numFeatures", "2000").format("libsvm").load("/data1/Datasets/epsilon/epsilon_normalized.t") df.persist(StorageLevel.MEMORY_AND_DISK) df.count val lir = new LinearRegression().setMaxIter(200).setSolver("l-bfgs").setLoss("huber") val results = Seq(1, 4, 16, 64, 256, 1024, 4096).map { size => val start = System.currentTimeMillis; val model = lir.setBlockSize(size).fit(df); val end = System.currentTimeMillis; (size, model, end - start) } {code} model coef: {code:java} scala> results.map(_._2.coefficients).foreach(coef => println(coef.toString.take(200))) [-0.1609083025667508,-0.1504208122473649,0.7857316265190127,0.1905294278240982,0.48613646504894936,-0.026194861709278365,0.590635887747112,0.03185142111622796,8.347531055523673,0.05032008235983659,0.0 [-0.14168611353422972,-0.09988761525554064,0.5465392380563737,0.1948729061499901,0.4763355879043651,-0.3012279914216939,0.6313906259537879,0.09533675545276975,10.461020810672274,0.15677230833505942,-0 [0.0129107378236514,-0.023733643262643805,0.7206248421409548,0.1281202961920889,0.6331850100541732,-0.07297545577093478,0.7943888663518902,0.1345404102446435,10.426743282094897,0.022989137878464405,0. 
[0.030744371107965504,-0.18953315635218193,0.7474602191912736,0.1759290649344934,0.48334851886329333,-0.18612454543317197,0.623576899875435,0.10960148194302292,9.305819813630439,0.07680152463656026,-0 [0.06489015002773292,-0.2013517907421197,0.7090030134636589,0.05515361023479412,0.3904484093136326,0.11987256805921637,0.550217950324033,0.0557189628809737,7.24524505892832,-0.09041629158543917,0.0809 [-0.18300047132898184,-0.21732260127922864,0.8444018472270687,0.10275527109275327,0.07750772677176482,0.2282620884662859,0.5299055708518087,0.07284146396600312,7.7820378386877245,-0.014623101293592242 [-0.09575146808314546,-0.2307269364289983,0.8121553524047764,0.14527766692142594,0.4327749717709629,-0.024082387632074886,0.6239466285761414,0.03986689640912914,7.6761329131634435,-0.03111169776197065 {code} objectiveHistory is also attached was (Author: podongfeng): {code:java} import org.apache.spark.ml.regression._ import org.apache.spark.storage.StorageLevel val df = spark.read.option("numFeatures", "2000").format("libsvm").load("/data1/Datasets/epsilon/epsilon_normalized.t") df.persist(StorageLevel.MEMORY_AND_DISK) df.countval lir = new LinearRegression().setMaxIter(200).setSolver("l-bfgs").setLoss("huber") val results = Seq(1, 4, 16, 64, 256, 1024, 4096).map { size => val start = System.currentTimeMillis; val model = lir.setBlockSize(size).fit(df); val end = System.currentTimeMillis; (size, model, end - start) } {code} model coef: {code:java} scala> results.map(_._2.coefficients).foreach(coef => println(coef.toString.take(200))) [-0.1609083025667508,-0.1504208122473649,0.7857316265190127,0.1905294278240982,0.48613646504894936,-0.026194861709278365,0.590635887747112,0.03185142111622796,8.347531055523673,0.05032008235983659,0.0 [-0.14168611353422972,-0.09988761525554064,0.5465392380563737,0.1948729061499901,0.4763355879043651,-0.3012279914216939,0.6313906259537879,0.09533675545276975,10.461020810672274,0.15677230833505942,-0 
[0.0129107378236514,-0.023733643262643805,0.7206248421409548,0.1281202961920889,0.6331850100541732,-0.07297545577093478,0.7943888663518902,0.1345404102446435,10.426743282094897,0.022989137878464405,0. [0.030744371107965504,-0.18953315635218193,0.7474602191912736,0.1759290649344934,0.48334851886329333,-0.18612454543317197,0.623576899875435,0.10960148194302292,9.305819813630439,0.07680152463656026,-0 [0.06489015002773292,-0.2013517907421197,0.7090030134636589,0.05515361023479412,0.3904484093136326,0.11987256805921637,0.550217950324033,0.0557189628809737,7.24524505892832,-0.09041629158543917,0.0809 [-0.18300047132898184,-0.21732260127922864,0.8444018472270687,0.10275527109275327,0.07750772677176482,0.2282620884662859,0.5299055708518087,0.07284146396600312,7.7820378386877245,-0.014623101293592242 [-0.09575146808314546,-0.2307269364289983,0.8121553524047764,0.14527766692142594,0.4327749717709629,-0.024082387632074886,0.6239466285761414,0.03986689640912914,7.6761329131634435,-0.03111169776197065 {code} objectiveHistory is also attached > Huber loss Convergence > ---------------------- > > Key: SPARK-32060 > URL: https://issues.apache.org/jira/browse/SPARK-32060 > Project: Spark > Issue Type: Sub-task > Components: ML > Affects Versions: 3.1.0 > Reporter: zhengruifeng > Priority: Minor > Attachments: huber.xlsx > > > |performance test in https://issues.apache.org/jira/browse/SPARK-31783, > Huber loss seems to start to diverge since 70 iters. 
> {code:java} > for (size <- Seq(1, 4, 16, 64); iter <- Seq(10, 50, 100)) { > Thread.sleep(10000) > val hlir = new > LinearRegression().setLoss("huber").setSolver("l-bfgs").setMaxIter(iter).setTol(0) > val start = System.currentTimeMillis > val model = hlir.setBlockSize(size).fit(df) > val end = System.currentTimeMillis > println((model.uid, size, iter, end - start, > model.summary.objectiveHistory.last, model.summary.totalIterations, > model.coefficients.toString.take(100))) > }{code}| > | | > | | > | | > | | > | | > | | > | | > | | > |result:| > |blockSize=1| > |(linReg_887d29a0b42b,1,10,34222,12.600287516874573,11,[-1.128806276706593,8.677674008637235,9.388511222747894,8.55780534824698,34.241366265505654,26.96490)| > |(linReg_fa87d52d3e2f,1,50,134017,1.7265674039265724,51,[-1.2409375311919224,-0.36565818648554393,1.0271741000977583,-0.5264376930209739,-1.544463380879014,)| > |(linReg_b2a07f6fa653,1,100,259137,0.7519335552972538,101,[-0.3821288691282684,0.22040814987367136,0.07747613675383101,0.16130205219214436,1.2347926613828966,)| > blockSize=4| > |(linReg_779f6890aee9,4,10,7241,12.600287516879131,11,[-1.128806276706101,8.677674008649985,9.38851122275203,8.557805348259139,34.241366265511715,26.96490)| > |(linReg_0e6d961e054f,4,50,11691,1.726567383577527,51,[-1.2409376473684588,-0.3656580427637058,1.0271741488856692,-0.5264377459728347,-1.5444635623477996,)| > |(linReg_1e12fafab7d2,4,100,17966,0.796858465032771,101,[-0.014663920062692357,-0.057216366204118345,0.1764582527782608,0.12141286532514688,1.58266258533765)| > blockSize=16| > |(linReg_5ad195c843bb,16,10,7338,12.600287516896273,11,[-1.1288062767576779,8.677674008672964,9.388511222753797,8.557805348281347,34.24136626552257,26.9649)| > |(linReg_686fe7849c42,16,50,12093,1.7265673762478049,51,[-1.2409376965631724,-0.3656579898205299,1.0271741857198382,-0.5264377659307408,-1.5444636325154564,)| > 
|(linReg_cc934209aac1,16,100,18253,0.7844992170383625,101,[-0.4230952901291041,0.08770018558785676,0.2719402480140563,0.08602481376955884,0.8763149744964053,-)| > blockSize=64| > |(linReg_2de48672cf40,64,10,7956,12.600287516883563,11,[-1.1288062767198885,8.677674008655007,9.388511222751507,8.557805348264019,34.24136626551386,26.9649)| > |(linReg_a4ed072bdf00,64,50,14423,1.7265674032944005,51,[-1.240937585330031,-0.36565823041213286,1.02717419529322,-0.5264376482700692,-1.5444634018412484,0.)| > |(linReg_ed9bf8e6db3d,64,100,22680,0.7508904951409897,101,[-0.39923222418441695,0.2591603128603928,0.025707538173424214,0.06178131424518882,1.3651702157456522)| -- This message was sent by Atlassian Jira (v8.3.4#803005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org