http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/resources/country.txt ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/bin/resources/country.txt b/community/mahout-mr/examples/bin/resources/country.txt new file mode 100644 index 0000000..6a22091 --- /dev/null +++ b/community/mahout-mr/examples/bin/resources/country.txt @@ -0,0 +1,229 @@ +Afghanistan +Albania +Algeria +American Samoa +Andorra +Angola +Anguilla +Antigua and Barbuda +Argentina +Armenia +Aruba +Australia +Austria +Azerbaijan +Bahamas +Bangladesh +Barbados +Belarus +Belgium +Belize +Benin +Bermuda +Bhutan +Bolivia +Bosnia and Herzegovina +Botswana +Bouvet Island +Brazil +British Indian Ocean Territory +Brunei Darussalam +Bulgaria +Burkina Faso +Burundi +Cambodia +Cameroon +Canada +Cape Verde +Cayman Islands +Central African Republic +Chad +Chile +China +Christmas Island +Cocos Islands +Colombia +Comoros +Congo +Cook Islands +Costa Rica +Croatia +C�te d'Ivoire +Cuba +Cyprus +Czech Republic +Djibouti +Dominica +Dominican Republic +Ecuador +Egypt +El Salvador +Equatorial Guinea +Eritrea +Estonia +Ethiopia +Falkland Islands +Faroe Islands +Fiji +Finland +France +French Guiana +French Polynesia +French Southern Territories +Gabon +Georgia +Germany +Ghana +Gibraltar +Greece +Greenland +Grenada +Guadeloupe +Guam +Guatemala +Guernsey +Guinea +Guinea-Bissau +Guyana +Haiti +Honduras +Hong Kong +Hungary +Iceland +India +Indonesia +Iran +Iraq +Ireland +Isle of Man +Israel +Italy +Japan +Jersey +Jordan +Kazakhstan +Kenya +Kiribati +Korea +Kuwait +Kyrgyzstan +Latvia +Lebanon +Lesotho +Liberia +Liechtenstein +Lithuania +Luxembourg +Macedonia +Madagascar +Malawi +Malaysia +Maldives +Mali +Malta +Marshall Islands +Martinique +Mauritania +Mauritius +Mayotte +Mexico +Micronesia +Moldova +Monaco +Mongolia +Montenegro +Montserrat +Morocco +Mozambique +Myanmar +Namibia +Nauru +Nepal +Netherlands +Netherlands Antilles +New Caledonia +New Zealand +Nicaragua +Niger +Nigeria +Niue +Norfolk Island +Northern Mariana Islands +Norway +Oman +Pakistan +Palau +Palestinian Territory +Panama +Papua New Guinea +Paraguay +Peru +Philippines +Pitcairn +Poland +Portugal +Puerto Rico +Qatar +R�union +Russian Federation +Rwanda +Saint Barth�lemy +Saint Helena +Saint Kitts and Nevis +Saint Lucia +Saint Martin +Saint Pierre and Miquelon +Saint Vincent and the Grenadines +Samoa +San Marino +Sao Tome and Principe +Saudi Arabia +Senegal +Serbia +Seychelles +Sierra Leone +Singapore +Slovakia +Slovenia +Solomon Islands +Somalia +South Africa +South Georgia and the South Sandwich Islands +Spain +Sri Lanka +Sudan +Suriname +Svalbard and Jan Mayen +Swaziland +Sweden +Switzerland +Syrian Arab Republic +Taiwan +Tanzania +Thailand +Timor-Leste +Togo +Tokelau +Tonga +Trinidad and Tobago +Tunisia +Turkey +Turkmenistan +Turks and Caicos Islands +Tuvalu +Ukraine +United Arab Emirates +United Kingdom +United States +United States Minor Outlying Islands +Uruguay +Uzbekistan +Vanuatu +Vatican +Venezuela +Vietnam +Virgin Islands +Wallis and Futuna +Yemen +Zambia +Zimbabwe
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/resources/country10.txt ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/bin/resources/country10.txt b/community/mahout-mr/examples/bin/resources/country10.txt new file mode 100644 index 0000000..97a63e1 --- /dev/null +++ b/community/mahout-mr/examples/bin/resources/country10.txt @@ -0,0 +1,10 @@ +Australia +Austria +Bahamas +Canada +Colombia +Cuba +Panama +Pakistan +United Kingdom +Vietnam http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/resources/country2.txt ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/bin/resources/country2.txt b/community/mahout-mr/examples/bin/resources/country2.txt new file mode 100644 index 0000000..f4b4f61 --- /dev/null +++ b/community/mahout-mr/examples/bin/resources/country2.txt @@ -0,0 +1,2 @@ +United States +United Kingdom http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/resources/donut-test.csv ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/bin/resources/donut-test.csv b/community/mahout-mr/examples/bin/resources/donut-test.csv new file mode 100644 index 0000000..46ea564 --- /dev/null +++ b/community/mahout-mr/examples/bin/resources/donut-test.csv @@ -0,0 +1,41 @@ +"x","y","shape","color","xx","xy","yy","c","a","b" +0.802415437065065,0.0978854028508067,21,2,0.643870533640319,0.07854475831082,0.00958155209126472,0.503141377562721,0.808363832523192,0.220502180491382 +0.97073650965467,0.989339149091393,23,2,0.942329371176533,0.96038763245370,0.978791951924881,0.67900343471543,1.38604520961670,0.989771844311643 +0.566630310611799,0.369259539060295,25,1,0.321069908904024,0.209233647314105,0.136352607187021,0.146740132271139,0.676330182744379,0.569352171215186 +0.377948862500489,0.500907538458705,24,1,0.142845342665413,0.189317434378387,0.250908362084759,0.122054511555201,0.62749797190921,0.79865886318828 +0.0133881184738129,0.269793515326455,25,2,0.000179241716268851,0.00361202754665705,0.0727885409122062,0.538317888266967,0.270125494221621,1.02283505301727 +0.395229484187439,0.385281964903697,25,1,0.156206345171069,0.152274792255611,0.148442192480054,0.155361155247979,0.551949760078871,0.717070128562224 +0.757145672803745,0.416044564917684,21,1,0.573269569845435,0.315006342020941,0.173093079997545,0.270503996498299,0.863922826323613,0.481737796145881 +0.589166145538911,0.971624446567148,24,2,0.347116747049177,0.572448230095344,0.944054065166917,0.479979395505718,1.13629697360157,1.05491161769044 +0.843438957352191,0.218833807157353,25,2,0.711389274779351,0.184572958142208,0.0478882351549814,0.443852166182378,0.871365313708512,0.269071728782402 +0.628562391968444,0.801476288354024,25,2,0.395090680597092,0.503777852913796,0.642364240793743,0.327744170151609,1.01855531091386,0.8833629703887 +0.262267543468624,0.247060472844169,22,2,0.0687842643570668,0.0647959433010369,0.0610388772419841,0.347124077652729,0.360309785599907,0.778002605819416 +0.738417695043609,0.562460686312988,21,1,0.545260692353516,0.415330923539883,0.316362023647678,0.246463657857698,0.928236347058869,0.620312280963368 +0.498857178725302,0.164454092038795,21,1,0.248858484765768,0.0820391043843046,0.0270451483883046,0.335547854098302,0.525265297877247,0.527436513434051 +0.499293045606464,0.733599063009024,25,1,0.249293545390979,0.366280910423824,0.538167585247717,0.233600132755117,0.88739006679064,0.888186376514393 +0.553942533675581,0.548312899889424,24,1,0.306852330614922,0.303733837011753,0.30064703618515,0.0724150069741539,0.779422457207946,0.706833997094728 +0.661088703200221,0.98143746308051,24,2,0.43703827349895,0.64881721974001,0.963219493937908,0.507672730364875,1.1833248782295,1.03830648704340 +0.492181566543877,0.376017479225993,23,1,0.242242694445585,0.185068871973329,0.141389144683470,0.124228794404457,0.619380205632255,0.63187712891139 +0.991064163157716,0.216620326042175,21,2,0.982208175495505,0.21468464215194,0.0469243656546183,0.566963889458783,1.01446170018888,0.21680455446021 +0.601602173643187,0.343355831922963,24,1,0.361925175332207,0.206563614817919,0.117893227315510,0.186709392055052,0.692689254029335,0.52594111396747 +0.0397100185509771,0.0602901463862509,25,2,0.00157688557331895,0.00239412283143915,0.00363490175127556,0.636562347604197,0.0721927096360464,0.962180726382856 +0.158290433697402,0.630195834673941,23,2,0.0250558614001118,0.0997539719848347,0.397146790040385,0.365672507948237,0.649771230080632,1.05148551299849 +0.967184047214687,0.497705311980098,25,2,0.935444981186582,0.48137263796116,0.247710577573207,0.467189682639721,1.08772954302059,0.498785990511377 +0.538070349488407,0.0130743277259171,24,2,0.289519700998577,0.00703490808881019,0.000170938045484685,0.488411672495383,0.538229169633216,0.462114639529248 +0.758642012253404,0.673675778554752,25,2,0.575537702755893,0.511078748249156,0.453839054611352,0.311542880770993,1.01458206044028,0.715606548922268 +0.986405614530668,0.981674374546856,21,2,0.972996036377624,0.9683291146939,0.96368457764196,0.684544100071034,1.39164672744903,0.981768498658543 +0.51937106740661,0.462004136526957,23,1,0.269746305659081,0.239951581534275,0.213447822168019,0.0426488439882434,0.695121664046734,0.666672328069706 +0.534244359936565,0.692785677267238,21,1,0.28541703612403,0.370116840724856,0.479951994626626,0.195803456422130,0.87485371963012,0.83479357381183 +0.0795328004751354,0.536029864801094,22,2,0.00632546635141770,0.0426319562859392,0.287328015958679,0.422008076977050,0.541898036820671,1.06517035321108 +0.330987347057089,0.804738595616072,23,2,0.10955262391189,0.266358292837412,0.647604207274128,0.348469350894533,0.870147591610767,1.04650950166343 +0.9804020607844,0.74571731640026,25,2,0.961188200790297,0.731102793761427,0.556094315979205,0.539595348001485,1.23178022259229,0.745974795285138 +0.362560331821442,0.805498170899227,21,2,0.131449994210474,0.292041684122788,0.648827303322001,0.334990738397057,0.883333061496328,1.02720817456326 +0.47635925677605,0.961423690896481,21,2,0.226918141516230,0.457983074842334,0.924335513417013,0.462028903057712,1.07296488988841,1.09477629741475 +0.850710266502574,0.635807712096721,24,2,0.723707957532881,0.540888148202193,0.404251446761667,0.376086992190972,1.06205433208219,0.65309943445803 +0.136131341336295,0.714137809583917,25,2,0.0185317420940189,0.0972165379176223,0.509992811077315,0.422203034393551,0.726996941651981,1.12083088398685 +0.930458213202655,0.865616530412808,24,2,0.865752486516278,0.805420010206583,0.749291977723908,0.564774043865972,1.27084399681479,0.868405457050378 +0.374636142514646,0.197784703457728,21,2,0.140352239278254,0.0740972983518064,0.0391187889218614,0.327185241457712,0.423640210792266,0.655895375171089 +0.482126326300204,0.841961156809703,22,1,0.232445794511731,0.405931639420132,0.708898589576332,0.342427950053959,0.970229036922758,0.988479504839456 +0.660344187868759,0.746531683253124,24,2,0.436054446452051,0.492967858096082,0.557309554100743,0.294088642131774,0.996676477375078,0.82016804669243 +0.0772640188224614,0.437956433976069,22,2,0.00596972860459766,0.0338382741581451,0.191805838061035,0.427264688298837,0.444719649515999,1.02139489377063 +0.998469967395067,0.464829172473401,25,2,0.996942275789907,0.464117968683793,0.216066159582307,0.499709210945471,1.10136662168971,0.464831690595724 http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/resources/donut.csv ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/bin/resources/donut.csv b/community/mahout-mr/examples/bin/resources/donut.csv new file mode 100644 index 0000000..33ba3b7 --- /dev/null +++ b/community/mahout-mr/examples/bin/resources/donut.csv @@ -0,0 +1,41 @@ +"x","y","shape","color","k","k0","xx","xy","yy","a","b","c","bias" +0.923307513352484,0.0135197141207755,21,2,4,8,0.852496764213146,0.0124828536260896,0.000182782669907495,0.923406490600458,0.0778750292332978,0.644866125183976,1 +0.711011884035543,0.909141522599384,22,2,3,9,0.505537899239772,0.64641042683833,0.826538308114327,1.15415605849213,0.953966686673604,0.46035073663368,1 +0.75118898646906,0.836567111080512,23,2,3,9,0.564284893392414,0.62842000028592,0.699844531341594,1.12433510339845,0.872783737128441,0.419968245447719,1 +0.308209649519995,0.418023289414123,24,1,5,1,0.094993188057238,0.128838811521522,0.174743470492603,0.519361780024138,0.808280495564412,0.208575453051705,1 +0.849057961953804,0.500220163026825,25,1,5,2,0.720899422757147,0.424715912147755,0.250220211498583,0.985454024425153,0.52249756970547,0.349058031386046,1 +0.0738831346388906,0.486534863477573,21,2,6,1,0.00545871758406844,0.0359467208248278,0.236716173379140,0.492112681164801,1.04613986717142,0.42632955896436,1 +0.612888508243486,0.0204555552918464,22,2,4,10,0.375632323536926,0.0125369747681119,0.000418429742297785,0.613229772009826,0.387651566219268,0.492652707029903,1 +0.207169560948387,0.932857288978994,23,2,1,4,0.0429192269835473,0.193259634985281,0.870222721601238,0.955584610897845,1.22425602987611,0.522604151014326,1 +0.309267645236105,0.506309477845207,24,1,5,1,0.0956464763898851,0.156585139973909,0.256349287355886,0.593292308854389,0.856423069092351,0.190836685845410,1 +0.78758287569508,0.171928803203627,25,2,4,10,0.620286786088131,0.135408181241926,0.0295595133710317,0.806130448165285,0.273277419610556,0.436273561610666,1 +0.930236018029973,0.0790199618786573,21,2,4,8,0.86533904924026,0.0735072146828825,0.00624415437530446,0.93358620577618,0.105409523078414,0.601936228937031,1 +0.238834470743313,0.623727766098455,22,1,5,1,0.0570419044152386,0.148967690904034,0.389036326202168,0.667890882268509,0.984077887735915,0.288991338582386,1 +0.83537525916472,0.802311758277938,23,2,3,7,0.697851823624524,0.670231393002335,0.643704157471036,1.15825557675997,0.819027144096042,0.451518508649315,1 +0.656760312616825,0.320640653371811,24,1,5,3,0.43133410822855,0.210584055746134,0.102810428594702,0.730851925374252,0.469706197095164,0.238209090579297,1 +0.180789119331166,0.114329558331519,25,2,2,5,0.0326847056685386,0.0206695401642766,0.0130712479082803,0.213906413126907,0.82715035810576,0.500636870310341,1 +0.990028728265315,0.061085847672075,21,2,4,8,0.980156882790638,0.0604767440857932,0.00373148078581595,0.991911469626425,0.06189432159595,0.657855445853466,1 +0.751934139290825,0.972332585137337,22,2,3,9,0.565404949831033,0.731130065509666,0.945430656119858,1.22916052895905,1.00347761677540,0.535321288127727,1 +0.136412925552577,0.552212274167687,23,2,6,1,0.0186084862578129,0.0753288918452558,0.304938395741448,0.5688118159807,1.02504684326820,0.3673168690368,1 +0.5729476721026,0.0981996888294816,24,2,4,10,0.328269034967789,0.0562632831160512,0.0096431788862070,0.581302170866406,0.43819729534628,0.408368525870829,1 +0.446335297077894,0.339370004367083,25,1,5,3,0.199215197417612,0.151472811718508,0.115171999864114,0.560702414192882,0.649397107420365,0.169357302283512,1 +0.922843366628513,0.912627586396411,21,2,3,7,0.851639879330248,0.842212314308118,0.832889111451739,1.29789405992245,0.915883320912091,0.590811338548155,1 +0.166969822719693,0.398156099021435,22,2,6,1,0.0278789216990458,0.0664800532683736,0.158528279187967,0.431749002184154,0.923291695753637,0.348254618269284,1 +0.350683249300346,0.84422400011681,23,2,1,6,0.122978741339848,0.296055215498298,0.712714162373228,0.914162405545687,1.06504760696993,0.375214144584023,1 +0.47748578293249,0.792779305484146,24,1,5,6,0.227992672902653,0.378540847371773,0.628499027203925,0.9254683679665,0.949484141121692,0.29364368150863,1 +0.384564548265189,0.153326370986179,25,2,2,5,0.147889891782409,0.0589638865954405,0.0235089760397912,0.414003463538894,0.634247405427742,0.365387395199715,1 +0.563622857443988,0.467359990812838,21,1,5,3,0.317670725433326,0.263414773476928,0.218425361012576,0.73218582781006,0.639414084578942,0.071506910079209,1 +0.343304847599939,0.854578266385943,22,2,1,6,0.117858218385617,0.293380861503846,0.730304013379203,0.920957236664559,1.07775346743350,0.387658506651072,1 +0.666085948701948,0.710089378990233,23,1,5,2,0.443670491058174,0.472980557667886,0.504226926154735,0.973600234805286,0.784681795257806,0.267809801016930,1 +0.190568120684475,0.0772022884339094,24,2,2,5,0.0363162086212125,0.0147122950193909,0.00596019333943254,0.205612261211838,0.813105258002736,0.523933195018469,1 +0.353534662164748,0.427994541125372,25,1,5,1,0.124986757351942,0.151310905505115,0.183179327233118,0.555127088678854,0.775304301713569,0.163208092002022,1 +0.127048352966085,0.927507144864649,21,2,1,4,0.0161412839913949,0.117838255119330,0.860269503774972,0.936168140755905,1.27370093893119,0.567322915045421,1 +0.960906301159412,0.891004979610443,22,2,3,7,0.923340919607862,0.856172299272088,0.793889873690606,1.31043152942016,0.891862204031343,0.604416671286136,1 +0.306814440060407,0.902291874401271,23,2,1,6,0.094135100629581,0.276836176215481,0.81413062661056,0.953029761990747,1.13782109627099,0.446272800849954,1 +0.087350245565176,0.671402548439801,24,2,6,4,0.00763006540029655,0.0586471774793016,0.450781382051459,0.677060889028273,1.13300968942079,0.446831795474291,1 +0.27015240653418,0.371201378758997,25,1,5,1,0.0729823227562089,0.100280945780549,0.137790463592580,0.459099974241765,0.81882108746687,0.263474858488646,1 +0.871842501685023,0.569787061074749,21,2,3,2,0.7601093477444,0.496764576755166,0.324657294968199,1.04152131169391,0.584021951079369,0.378334613738721,1 +0.686449621338397,0.169308491749689,22,2,4,10,0.471213082635629,0.116221750050949,0.0286653653785545,0.707020825728764,0.356341416814533,0.379631841296403,1 +0.67132937326096,0.571220482233912,23,1,5,2,0.450683127402953,0.383477088331915,0.326292839323543,0.881462402332905,0.659027480614106,0.185542747720368,1 +0.548616112209857,0.405350996181369,24,1,5,3,0.300979638576258,0.222382087605415,0.164309430105228,0.682121007359754,0.606676886210257,0.106404700508298,1 +0.677980388281867,0.993355110753328,25,2,3,9,0.459657406894831,0.673475283690318,0.986754376059756,1.20266860895036,1.04424662144096,0.524477152905055,1 http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/resources/test-data.csv ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/bin/resources/test-data.csv b/community/mahout-mr/examples/bin/resources/test-data.csv new file mode 100644 index 0000000..ab683cd --- /dev/null +++ b/community/mahout-mr/examples/bin/resources/test-data.csv @@ -0,0 +1,61 @@ +"V1","V2","V3","V4","V5","V6","V7","V8","y" +1,-0.212887381184450,-0.955959589855826,-0.00326541907490505,0.0560086232868742,0.091264583618544,0.0172194710825328,-0.0237399208336878,1 +1,3.14702017427074,2.12881054220556,-0.00566925018709358,-0.055626039510634,-0.0630510476335515,-0.00155145331201058,0.108559859662683,0 +1,-2.16541417186635,-2.71847685293678,-0.00833554984263851,0.0433655514274994,-0.102555485096075,-0.156155728366877,-0.0241458595902909,1 +1,-4.33686585982661,-2.6857484867589,-0.0115524101901378,0.122387581992154,0.081766215557828,-0.0206167352421607,-0.0424490760296281,1 +1,2.34100936064648,2.10958510331364,-0.0129315842415535,0.173866353524092,-0.0299915285951044,0.108136400830407,-0.0063355720943443,0 +1,1.30317270786224,3.37038662087804,-0.0230504278644102,-0.131884713919903,0.086455020204179,0.17337860146005,-0.0524355492943794,0 +1,1.94943481762617,3.54806480367192,-0.029538920288902,-0.0720379027720258,0.214306548234308,-0.082665692089578,0.226607475768828,0 +1,3.14635496849369,1.76134258264267,-0.0318247859223975,-0.187198080297378,-0.08576487890296,0.153638925055934,-0.0691201521844938,0 +1,-1.26105438936697,-1.95583819596755,-0.0367826492102569,-0.0936093811581598,-0.0317225362744449,-0.0840334569992295,-0.0627566339884115,1 +1,2.40442001058194,3.23077413487565,-0.0452264569747572,0.0371989606630366,-0.17352653795031,0.102543062447842,-0.0551882772900301,0 +1,-2.20940227045733,-0.175769402031962,-0.0465958462590872,0.130789407148096,-0.140283147466875,0.0708851428212228,0.0605244763586474,1 +1,-1.64710385829030,-2.57691366099069,-0.0553070134425288,-0.0349011715152424,-0.0826092377112715,0.106766133325393,-0.0585587032435851,1 +1,-2.6523724984616,-4.16903830585265,-0.0568310036349303,-0.0291979248790545,-0.255996825268056,0.0401827924643623,0.0179311252387879,1 +1,2.34337447158977,0.28996735916551,-0.0625800583342644,0.0899232083837452,0.0255207970332586,-0.0343458209061299,0.0755898049986344,0 +1,3.67556867120403,1.36097809464341,-0.0956707962851342,0.0537771695881714,-0.0373171704803031,0.0463473815328367,-0.228499359561800,0 +1,1.96533061882493,2.92646586187099,-0.103334098736041,-0.0194013528907574,0.0253359438067293,0.00748464018133427,-0.239745502177878,0 +1,-1.95041601303593,-0.860607985906108,-0.103721968898869,-0.00972933741506002,0.0227857854969761,-0.0287381002832544,-0.130156656165122,1 +1,-1.51543545229533,-1.35683836829949,-0.106483722717291,0.103877046729912,0.00840497101030744,0.0258430051020969,0.168907472637671,1 +1,1.45074382041585,1.88231080047069,-0.107681637419817,-0.00626324733854461,-0.144385489192821,0.00088239451623517,-0.00299885969569744,0 +1,3.87956616310254,4.31276421460554,-0.129963535661731,-0.0640782960295875,-0.0324909886960640,0.0428280701443882,0.0329254937199428,0 +1,-2.88187391546093,-3.16731558128991,-0.136390769151814,-0.155408895734766,0.105626409419800,-0.0918345772196075,0.197828194781600,1 +1,-2.65024496288248,-1.81147577507541,-0.145438998990911,0.0691687502404964,0.0749439097959056,-0.0674149410216342,0.123896965825847,1 +1,-1.37426198993006,-2.08894064826135,-0.153236566384176,0.0213513951854753,-0.134553043562400,0.00287304090325258,0.0122158739075685,1 +1,1.65698424179346,2.49004336804714,-0.153862461770005,0.105220938080375,-0.0946233303225818,-0.122426312548592,-0.00538234276442917,0 +1,2.93315586503758,2.75229115279104,-0.168877592929163,-0.0349207806558679,0.0189964813847077,0.202397029441612,0.0426299706123943,0 +1,-3.84306960373604,-2.35606387141237,-0.179511886850707,-0.0916819865200809,0.0265829433229566,0.101658708455140,-0.0855390303406673,1 +1,2.28101644492271,1.37963780647481,-0.180898801743387,-0.0789829066843624,-0.0779025366072777,0.0442621459868237,-0.136195159617836,0 +1,1.70008372335953,2.71018350574622,-0.188985514267118,-0.195856534813112,-0.106263419324547,-0.0311178988395261,-0.121173036989233,0 +1,-2.05613043162767,-1.73770126734937,0.00630625444849072,-0.134595964087825,0.0708994966210059,0.0739139562742148,-0.00416084523004362,1 +1,2.39375626983328,3.2468518382106,0.00951905535238045,-0.140380515724865,0.0630970962358967,0.00183192220061040,-0.0773483294293499,0 +1,4.26863682432937,3.49421800345979,0.0109175198048448,-0.109995560295421,-0.111585866731122,0.154763193427948,-0.0186987535307691,0 +1,1.54495296452702,3.17243560853872,0.0117478311845783,0.115838636637105,-0.1715332868224,0.0927292648278796,-0.0885962242970987,0 +1,2.16883227993245,1.63879588167162,0.0158863105366749,-0.00488771308802354,0.0280782748001184,0.131946735985038,0.066416828384239,0 +1,1.86427271422921,3.32026821853873,0.0162473257475520,0.0355005599857545,-0.0988825269654524,0.0527023072810735,0.100841323212596,0 +1,-3.03828333997027,-1.43214405751321,0.0247204684728272,0.146197859364444,0.0141171187314724,-0.201738256450160,0.044002672456105,1 +1,2.08595761680696,0.225336429607513,0.0335964287149376,0.0576493862055925,0.121452048491972,0.0640240734436852,0.224720096669846,0 +1,-1.85256114614442,-2.22817393781734,0.0346230650580488,0.160185441442375,0.0114059982858295,0.00496408500928602,-0.094156048483371,1 +1,2.33572915427688,1.03334367238243,0.0357824515834720,-0.172284120406131,0.0329286256184980,-0.101030665525296,-0.00238851979619332,0 +1,-2.00334039609229,-2.98875026257892,0.0375804284421083,0.142856636546252,-0.0862220203147005,-0.0441603903572752,0.0147126239348866,1 +1,2.38346139581192,1.21051372282823,0.0405425233313353,-0.145245065311593,-0.0216697981922324,-0.0128934036902430,-0.0325085994141851,0 +1,-1.15629168023471,-1.37784639006639,0.0429948703549178,-0.00491267793152886,0.0263522850749959,-0.0442602193050815,0.0582704866256344,1 +1,2.13230915550664,1.32833684701498,0.0434112538719301,-0.0296522957829338,0.00247091583877657,-0.123872403365319,-0.136549696313901,0 +1,-1.88291252343724,-1.99980946454726,0.0472833199907535,-0.0365284873908706,-0.0209054390489622,-0.0891896486647233,0.0542966824787834,1 +1,-1.34787394136153,-2.57763619051754,0.0493154843443071,0.0384664637019124,-0.00780509859650452,-0.118550134827935,0.00573215142098708,1 +1,-1.81748193199251,-2.72113041015796,0.0551479875680516,-0.255723061179778,-0.217672946803948,0.145106553357089,0.0632886151091758,1 +1,-3.13049595715861,-0.0285946551309455,0.0724437318718333,-0.0360911974267016,-0.121364676014540,0.038351368519738,-0.0125375424386282,1 +1,-2.3836883021805,-1.40162632998805,0.0746620557343183,0.069222624188286,0.04657285528431,0.0932835769596473,0.00836816351062604,1 +1,-2.43800450243598,-0.965440038635416,0.0763675021411913,-0.122575769653323,0.045866930905471,-0.0493852614669876,0.128116802512532,1 +1,1.09024638837653,2.21814920469686,0.0769910502309598,-0.270152593833931,-0.252735856082821,0.0661674666715274,-0.000429289775969046,0 +1,3.17642151475607,1.18015379683312,0.0776648965451875,-0.117234850817615,0.0759455286430382,0.119280079276134,0.117056969569811,0 +1,-3.5501372839931,-4.02435741321994,0.0833451415432366,-0.0185864612285970,0.0553371588028254,0.0269699189958747,-0.0930023774668385,1 +1,-2.85922019599943,-2.07644295605507,0.0903467736346066,0.124804691516462,0.0673015037344841,0.0234043567104492,0.0866115903248345,1 +1,0.513249476607372,5.0165612245778,0.0934321220365115,-0.0387550539552360,0.070129320868753,0.0635055975927393,-0.00773489793089484,0 +1,1.30094323285406,2.74698316868320,0.094239413405751,-0.105600040230387,-0.0134676903839459,0.00834379403909127,0.0978349326557826,0 +1,1.62511731278249,3.01296963021698,0.104352029985773,-0.0065839083200722,0.068460830526483,-0.1202220553,0.121998460927858,0 +1,1.82917662184333,2.89388269168932,0.110781239485760,-0.262387884050666,-0.00517657837760664,-0.0224028641246511,-0.108606003593092,0 +1,-3.17279743572930,-2.86698187406046,0.110873139279243,-0.093614374710967,0.0925974010859032,-0.00747619041107016,-0.066394213442664,1 +1,-3.20104938765970,-1.68043245593876,0.123227179211642,-0.00179275501686146,-0.175893752209014,-0.0835732816974749,0.0560957582079696,1 +1,-1.89923900052239,-2.92427973445236,0.147975477003611,0.00819675018680998,0.00470753628896422,-0.0122227288860826,0.209903875101594,1 +1,0.148491843864120,-1.54734877494689,0.162479731968606,0.112962938668545,-0.0100535803565242,0.0422099301034027,0.0752974779385111,1 http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/set-dfs-commands.sh ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/bin/set-dfs-commands.sh b/community/mahout-mr/examples/bin/set-dfs-commands.sh new file mode 100755 index 0000000..0ee5fe1 --- /dev/null +++ b/community/mahout-mr/examples/bin/set-dfs-commands.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# Requires $HADOOP_HOME to be set. +# +# Figures out the major version of Hadoop we're using and sets commands +# for dfs commands +# +# Run by each example script. + +# Find a hadoop shell +if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then + HADOOP="${HADOOP_HOME}/bin/hadoop" + if [ ! -e $HADOOP ]; then + echo "Can't find hadoop in $HADOOP, exiting" + exit 1 + fi +fi + +# Check Hadoop version +v=`${HADOOP_HOME}/bin/hadoop version | egrep "Hadoop [0-9]+.[0-9]+.[0-9]+" | cut -f 2 -d ' ' | cut -f 1 -d '.'` + +if [ $v -eq "1" -o $v -eq "0" ] +then + echo "Discovered Hadoop v0 or v1." + export DFS="${HADOOP_HOME}/bin/hadoop dfs" + export DFSRM="$DFS -rmr -skipTrash" +elif [ $v -eq "2" ] +then + echo "Discovered Hadoop v2." + export DFS="${HADOOP_HOME}/bin/hdfs dfs" + export DFSRM="$DFS -rm -r -skipTrash" +else + echo "Can't determine Hadoop version." + exit 1 +fi +echo "Setting dfs command to $DFS, dfs rm to $DFSRM." + +export HVERSION=$v http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/pom.xml ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/pom.xml b/community/mahout-mr/examples/pom.xml new file mode 100644 index 0000000..28a5795 --- /dev/null +++ b/community/mahout-mr/examples/pom.xml @@ -0,0 +1,199 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.mahout</groupId> + <artifactId>mahout-mr</artifactId> + <version>0.14.0-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + + <artifactId>mr-examples</artifactId> + <name>Mahout Examples</name> + <description>Scalable machine learning library examples</description> + + <packaging>jar</packaging> + <properties> + <mahout.skip.example>false</mahout.skip.example> + </properties> + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + <executions> + <execution> + <id>copy-dependencies</id> + <phase>package</phase> + <goals> + <goal>copy-dependencies</goal> + </goals> + <configuration> + <!-- configure the plugin here --> + </configuration> + </execution> + </executions> + </plugin> + + <!-- create examples hadoop job jar --> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-assembly-plugin</artifactId> + <executions> + <execution> + <id>job</id> + <phase>package</phase> + <goals> + <goal>single</goal> + </goals> + <configuration> + <skipAssembly>${mahout.skip.example}</skipAssembly> + <descriptors> + <descriptor>src/main/assembly/job.xml</descriptor> + </descriptors> + </configuration> + </execution> + </executions> + </plugin> + + + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-remote-resources-plugin</artifactId> + <configuration> + <appendedResourcesDirectory>../src/main/appended-resources</appendedResourcesDirectory> + <resourceBundles> + <resourceBundle>org.apache:apache-jar-resource-bundle:1.4</resourceBundle> + </resourceBundles> + <supplementalModels> + <supplementalModel>supplemental-models.xml</supplementalModel> + </supplementalModels> + </configuration> + </plugin> + + <plugin> + <artifactId>maven-source-plugin</artifactId> + </plugin> + + <plugin> + <groupId>org.mortbay.jetty</groupId> + <artifactId>maven-jetty-plugin</artifactId> + <version>6.1.26</version> + </plugin> + </plugins> + + </build> + + <dependencies> + + <!-- our modules --> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>mahout-hdfs</artifactId> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>mahout-mr</artifactId> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>mahout-hdfs</artifactId> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>mahout-mr</artifactId> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>mahout-math</artifactId> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>mahout-math</artifactId> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>mahout-integration</artifactId> + </dependency> + + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-benchmark</artifactId> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers-common</artifactId> + </dependency> + + <dependency> + <groupId>com.carrotsearch.randomizedtesting</groupId> + <artifactId>randomizedtesting-runner</artifactId> + </dependency> + + <dependency> + <groupId>org.easymock</groupId> + <artifactId>easymock</artifactId> + </dependency> + + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + </dependency> + + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-api</artifactId> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>jcl-over-slf4j</artifactId> + </dependency> + <dependency> + <groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + </dependency> + <dependency> + <groupId>log4j</groupId> + <artifactId>log4j</artifactId> + </dependency> + + </dependencies> + + <profiles> + <profile> + <id>release.prepare</id> + <properties> + <mahout.skip.example>true</mahout.skip.example> + </properties> + </profile> + </profiles> +</project> http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/src/main/assembly/job.xml ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/src/main/assembly/job.xml b/community/mahout-mr/examples/src/main/assembly/job.xml new file mode 100644 index 0000000..0c41f3d --- /dev/null +++ b/community/mahout-mr/examples/src/main/assembly/job.xml @@ -0,0 +1,46 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<assembly + xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 + http://maven.apache.org/xsd/assembly-1.1.0.xsd"> + <id>job</id> + <formats> + <format>jar</format> + </formats> + <includeBaseDirectory>false</includeBaseDirectory> + <dependencySets> + <dependencySet> + <unpack>true</unpack> + <unpackOptions> + <!-- MAHOUT-1126 --> + <excludes> + <exclude>META-INF/LICENSE</exclude> + </excludes> + </unpackOptions> + <scope>runtime</scope> + <outputDirectory>/</outputDirectory> + <useTransitiveFiltering>true</useTransitiveFiltering> + <excludes> + <exclude>org.apache.hadoop:hadoop-core</exclude> + </excludes> + </dependencySet> + </dependencySets> +</assembly> + \ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java new file mode 100644 index 0000000..6392b9f --- /dev/null +++ b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java @@ -0,0 +1,75 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.example; + +import java.io.File; + +import org.apache.commons.cli2.CommandLine; +import org.apache.commons.cli2.Group; +import org.apache.commons.cli2.Option; +import org.apache.commons.cli2.OptionException; +import org.apache.commons.cli2.builder.ArgumentBuilder; +import org.apache.commons.cli2.builder.DefaultOptionBuilder; +import org.apache.commons.cli2.builder.GroupBuilder; +import org.apache.commons.cli2.commandline.Parser; +import org.apache.mahout.common.CommandLineUtil; +import org.apache.mahout.common.commandline.DefaultOptionCreator; + +/** + * This class provides a common implementation for parsing input parameters for + * all taste examples. Currently they only need the path to the recommendations + * file as input. + * + * The class is safe to be used in threaded contexts. + */ +public final class TasteOptionParser { + + private TasteOptionParser() { + } + + /** + * Parse the given command line arguments. + * @param args the arguments as given to the application. + * @return the input file if a file was given on the command line, null otherwise. + */ + public static File getRatings(String[] args) throws OptionException { + DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); + ArgumentBuilder abuilder = new ArgumentBuilder(); + GroupBuilder gbuilder = new GroupBuilder(); + + Option inputOpt = obuilder.withLongName("input").withRequired(false).withShortName("i") + .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create()) + .withDescription("The Path for input data directory.").create(); + + Option helpOpt = DefaultOptionCreator.helpOption(); + + Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(helpOpt).create(); + + Parser parser = new Parser(); + parser.setGroup(group); + CommandLine cmdLine = parser.parse(args); + + if (cmdLine.hasOption(helpOpt)) { + CommandLineUtil.printHelp(group); + return null; + } + + return cmdLine.hasOption(inputOpt) ? new File(cmdLine.getValue(inputOpt).toString()) : null; + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java new file mode 100644 index 0000000..c908e5b --- /dev/null +++ b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java @@ -0,0 +1,102 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.example.bookcrossing; + +import org.apache.mahout.cf.taste.common.Refreshable; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood; +import org.apache.mahout.cf.taste.impl.recommender.GenericBooleanPrefUserBasedRecommender; +import org.apache.mahout.cf.taste.impl.similarity.CachingUserSimilarity; +import org.apache.mahout.cf.taste.impl.similarity.LogLikelihoodSimilarity; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood; +import org.apache.mahout.cf.taste.recommender.IDRescorer; +import org.apache.mahout.cf.taste.recommender.RecommendedItem; +import org.apache.mahout.cf.taste.recommender.Recommender; +import org.apache.mahout.cf.taste.similarity.UserSimilarity; + +import java.util.Collection; +import java.util.List; + +/** + * A simple {@link Recommender} implemented for the Book Crossing demo. + * See the <a href="http://www.informatik.uni-freiburg.de/~cziegler/BX/">Book Crossing site</a>. + */ +public final class BookCrossingBooleanRecommender implements Recommender { + + private final Recommender recommender; + + public BookCrossingBooleanRecommender(DataModel bcModel) throws TasteException { + UserSimilarity similarity = new CachingUserSimilarity(new LogLikelihoodSimilarity(bcModel), bcModel); + UserNeighborhood neighborhood = + new NearestNUserNeighborhood(10, Double.NEGATIVE_INFINITY, similarity, bcModel, 1.0); + recommender = new GenericBooleanPrefUserBasedRecommender(bcModel, neighborhood, similarity); + } + + @Override + public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException { + return recommender.recommend(userID, howMany); + } + + @Override + public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException { + return recommend(userID, howMany, null, includeKnownItems); + } + + @Override + public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException { + return recommender.recommend(userID, howMany, rescorer, false); + } + + @Override + public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems) + throws TasteException { + return recommender.recommend(userID, howMany, rescorer, includeKnownItems); + } + + @Override + public float estimatePreference(long userID, long itemID) throws TasteException { + return recommender.estimatePreference(userID, itemID); + } + + @Override + public void setPreference(long userID, long itemID, float value) throws TasteException { + recommender.setPreference(userID, itemID, value); + } + + @Override + public void removePreference(long userID, long itemID) throws TasteException { + recommender.removePreference(userID, itemID); + } + + @Override + public DataModel getDataModel() { + return recommender.getDataModel(); + } + + @Override + public void refresh(Collection<Refreshable> alreadyRefreshed) { + recommender.refresh(alreadyRefreshed); + } + + @Override + public String toString() { + return "BookCrossingBooleanRecommender[recommender:" + recommender + ']'; + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java new file mode 100644 index 0000000..2219bce --- /dev/null +++ b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java @@ -0,0 +1,32 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.example.bookcrossing; + +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.eval.RecommenderBuilder; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.recommender.Recommender; + +final class BookCrossingBooleanRecommenderBuilder implements RecommenderBuilder { + + @Override + public Recommender buildRecommender(DataModel dataModel) throws TasteException { + return new BookCrossingBooleanRecommender(dataModel); + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java new file mode 100644 index 0000000..b9814c7 --- /dev/null +++ b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.example.bookcrossing; + +import org.apache.commons.cli2.OptionException; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.eval.IRStatistics; +import org.apache.mahout.cf.taste.eval.RecommenderIRStatsEvaluator; +import org.apache.mahout.cf.taste.example.TasteOptionParser; +import org.apache.mahout.cf.taste.impl.eval.GenericRecommenderIRStatsEvaluator; +import org.apache.mahout.cf.taste.model.DataModel; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; + +public final class BookCrossingBooleanRecommenderEvaluatorRunner { + + private static final Logger log = LoggerFactory.getLogger(BookCrossingBooleanRecommenderEvaluatorRunner.class); + + private BookCrossingBooleanRecommenderEvaluatorRunner() { + // do nothing + } + + public static void main(String... args) throws IOException, TasteException, OptionException { + RecommenderIRStatsEvaluator evaluator = new GenericRecommenderIRStatsEvaluator(); + File ratingsFile = TasteOptionParser.getRatings(args); + DataModel model = + ratingsFile == null ? new BookCrossingDataModel(true) : new BookCrossingDataModel(ratingsFile, true); + + IRStatistics evaluation = evaluator.evaluate( + new BookCrossingBooleanRecommenderBuilder(), + new BookCrossingDataModelBuilder(), + model, + null, + 3, + Double.NEGATIVE_INFINITY, + 1.0); + + log.info(String.valueOf(evaluation)); + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java new file mode 100644 index 0000000..3e2f8b5 --- /dev/null +++ b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.example.bookcrossing; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.util.regex.Pattern; + +import com.google.common.base.Charsets; +import com.google.common.io.Closeables; +import org.apache.mahout.cf.taste.similarity.precompute.example.GroupLensDataModel; +import org.apache.mahout.cf.taste.impl.model.file.FileDataModel; +import org.apache.mahout.common.iterator.FileLineIterable; + +/** + * See <a href="http://www.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip">download</a> for + * data needed by this class. The BX-Book-Ratings.csv file is needed. + */ +public final class BookCrossingDataModel extends FileDataModel { + + private static final Pattern NON_DIGIT_SEMICOLON_PATTERN = Pattern.compile("[^0-9;]"); + + public BookCrossingDataModel(boolean ignoreRatings) throws IOException { + this(GroupLensDataModel.readResourceToTempFile( + "/org/apache/mahout/cf/taste/example/bookcrossing/BX-Book-Ratings.csv"), + ignoreRatings); + } + + /** + * @param ratingsFile BookCrossing ratings file in its native format + * @throws IOException if an error occurs while reading or writing files + */ + public BookCrossingDataModel(File ratingsFile, boolean ignoreRatings) throws IOException { + super(convertBCFile(ratingsFile, ignoreRatings)); + } + + private static File convertBCFile(File originalFile, boolean ignoreRatings) throws IOException { + if (!originalFile.exists()) { + throw new FileNotFoundException(originalFile.toString()); + } + File resultFile = new File(new File(System.getProperty("java.io.tmpdir")), "taste.bookcrossing.txt"); + resultFile.delete(); + Writer writer = null; + try { + writer = new OutputStreamWriter(new FileOutputStream(resultFile), Charsets.UTF_8); + for (String line : new FileLineIterable(originalFile, true)) { + // 0 ratings are basically "no rating", ignore them (thanks h.9000) + if (line.endsWith("\"0\"")) { + continue; + } + // Delete replace anything that isn't numeric, or a semicolon delimiter. Make comma the delimiter. + String convertedLine = NON_DIGIT_SEMICOLON_PATTERN.matcher(line) + .replaceAll("").replace(';', ','); + // If this means we deleted an entire ID -- few cases like that -- skip the line + if (convertedLine.contains(",,")) { + continue; + } + if (ignoreRatings) { + // drop rating + convertedLine = convertedLine.substring(0, convertedLine.lastIndexOf(',')); + } + writer.write(convertedLine); + writer.write('\n'); + } + writer.flush(); + } catch (IOException ioe) { + resultFile.delete(); + throw ioe; + } finally { + Closeables.close(writer, false); + } + return resultFile; + } + + @Override + public String toString() { + return "BookCrossingDataModel"; + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java new file mode 100644 index 0000000..9ec2eaf --- /dev/null +++ b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java @@ -0,0 +1,33 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.example.bookcrossing; + +import org.apache.mahout.cf.taste.eval.DataModelBuilder; +import org.apache.mahout.cf.taste.impl.common.FastByIDMap; +import org.apache.mahout.cf.taste.impl.model.GenericBooleanPrefDataModel; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.model.PreferenceArray; + +final class BookCrossingDataModelBuilder implements DataModelBuilder { + + @Override + public DataModel buildDataModel(FastByIDMap<PreferenceArray> trainingData) { + return new GenericBooleanPrefDataModel(GenericBooleanPrefDataModel.toDataMap(trainingData)); + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java new file mode 100644 index 0000000..c06ca2f --- /dev/null +++ b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.example.bookcrossing; + +import java.util.Collection; +import java.util.List; + +import org.apache.mahout.cf.taste.common.Refreshable; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood; +import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender; +import org.apache.mahout.cf.taste.impl.similarity.CachingUserSimilarity; +import org.apache.mahout.cf.taste.impl.similarity.EuclideanDistanceSimilarity; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood; +import org.apache.mahout.cf.taste.recommender.IDRescorer; +import org.apache.mahout.cf.taste.recommender.RecommendedItem; +import org.apache.mahout.cf.taste.recommender.Recommender; +import org.apache.mahout.cf.taste.similarity.UserSimilarity; + +/** + * A simple {@link Recommender} implemented for the Book Crossing demo. + * See the <a href="http://www.informatik.uni-freiburg.de/~cziegler/BX/">Book Crossing site</a>. + */ +public final class BookCrossingRecommender implements Recommender { + + private final Recommender recommender; + + public BookCrossingRecommender(DataModel bcModel) throws TasteException { + UserSimilarity similarity = new CachingUserSimilarity(new EuclideanDistanceSimilarity(bcModel), bcModel); + UserNeighborhood neighborhood = new NearestNUserNeighborhood(10, 0.2, similarity, bcModel, 0.2); + recommender = new GenericUserBasedRecommender(bcModel, neighborhood, similarity); + } + + @Override + public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException { + return recommender.recommend(userID, howMany); + } + + @Override + public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException { + return recommend(userID, howMany, null, includeKnownItems); + } + + @Override + public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException { + return recommender.recommend(userID, howMany, rescorer, false); + } + + @Override + public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems) + throws TasteException { + return recommender.recommend(userID, howMany, rescorer, false); + } + + @Override + public float estimatePreference(long userID, long itemID) throws TasteException { + return recommender.estimatePreference(userID, itemID); + } + + @Override + public void setPreference(long userID, long itemID, float value) throws TasteException { + recommender.setPreference(userID, itemID, value); + } + + @Override + public void removePreference(long userID, long itemID) throws TasteException { + recommender.removePreference(userID, itemID); + } + + @Override + public DataModel getDataModel() { + return recommender.getDataModel(); + } + + @Override + public void refresh(Collection<Refreshable> alreadyRefreshed) { + recommender.refresh(alreadyRefreshed); + } + + @Override + public String toString() { + return "BookCrossingRecommender[recommender:" + recommender + ']'; + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java new file mode 100644 index 0000000..bb6d3e1 --- /dev/null +++ b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java @@ -0,0 +1,32 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.example.bookcrossing; + +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.eval.RecommenderBuilder; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.recommender.Recommender; + +final class BookCrossingRecommenderBuilder implements RecommenderBuilder { + + @Override + public Recommender buildRecommender(DataModel dataModel) throws TasteException { + return new BookCrossingRecommender(dataModel); + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java new file mode 100644 index 0000000..97074d2 --- /dev/null +++ b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java @@ -0,0 +1,54 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.example.bookcrossing; + +import java.io.File; +import java.io.IOException; + +import org.apache.commons.cli2.OptionException; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.eval.RecommenderEvaluator; +import org.apache.mahout.cf.taste.example.TasteOptionParser; +import org.apache.mahout.cf.taste.impl.eval.AverageAbsoluteDifferenceRecommenderEvaluator; +import org.apache.mahout.cf.taste.model.DataModel; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public final class BookCrossingRecommenderEvaluatorRunner { + + private static final Logger log = LoggerFactory.getLogger(BookCrossingRecommenderEvaluatorRunner.class); + + private BookCrossingRecommenderEvaluatorRunner() { + // do nothing + } + + public static void main(String... args) throws IOException, TasteException, OptionException { + RecommenderEvaluator evaluator = new AverageAbsoluteDifferenceRecommenderEvaluator(); + File ratingsFile = TasteOptionParser.getRatings(args); + DataModel model = + ratingsFile == null ? new BookCrossingDataModel(false) : new BookCrossingDataModel(ratingsFile, false); + + double evaluation = evaluator.evaluate(new BookCrossingRecommenderBuilder(), + null, + model, + 0.9, + 0.3); + log.info(String.valueOf(evaluation)); + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README new file mode 100644 index 0000000..9244fe3 --- /dev/null +++ b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README @@ -0,0 +1,9 @@ +Code works with BookCrossing data set, which is not included in this distribution but is downloadable from +http://www.informatik.uni-freiburg.de/~cziegler/BX/ + +Data set originated from: + +Improving Recommendation Lists Through Topic Diversification, + Cai-Nicolas Ziegler, Sean M. McNee, Joseph A. Konstan, Georg Lausen; + Proceedings of the 14th International World Wide Web Conference (WWW '05), May 10-14, 2005, Chiba, Japan. + To appear. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java new file mode 100644 index 0000000..033daa2 --- /dev/null +++ b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java @@ -0,0 +1,104 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.example.email; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Writable; +import org.apache.mahout.common.HadoopUtil; +import org.apache.mahout.common.Pair; +import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable; +import org.apache.mahout.math.map.OpenObjectIntHashMap; + +import java.io.IOException; +import java.util.regex.Pattern; + +public final class EmailUtility { + + public static final String SEPARATOR = "separator"; + public static final String MSG_IDS_PREFIX = "msgIdsPrefix"; + public static final String FROM_PREFIX = "fromPrefix"; + public static final String MSG_ID_DIMENSION = "msgIdDim"; + public static final String FROM_INDEX = "fromIdx"; + public static final String REFS_INDEX = "refsIdx"; + private static final String[] EMPTY = new String[0]; + private static final Pattern ADDRESS_CLEANUP = Pattern.compile("mailto:|<|>|\\[|\\]|\\=20"); + private static final Pattern ANGLE_BRACES = Pattern.compile("<|>"); + private static final Pattern SPACE_OR_CLOSE_ANGLE = Pattern.compile(">|\\s+"); + public static final Pattern WHITESPACE = Pattern.compile("\\s*"); + + private EmailUtility() { + } + + /** + * Strip off some spurious characters that make it harder to dedup + */ + public static String cleanUpEmailAddress(CharSequence address) { + //do some cleanup to normalize some things, like: Key: karthik ananth <[email protected]>: Value: 178 + //Key: karthik ananth [mailto:[email protected]]=20: Value: 179 + //TODO: is there more to clean up here? + return ADDRESS_CLEANUP.matcher(address).replaceAll(""); + } + + public static void loadDictionaries(Configuration conf, String fromPrefix, + OpenObjectIntHashMap<String> fromDictionary, + String msgIdPrefix, + OpenObjectIntHashMap<String> msgIdDictionary) throws IOException { + + Path[] localFiles = HadoopUtil.getCachedFiles(conf); + FileSystem fs = FileSystem.getLocal(conf); + for (Path dictionaryFile : localFiles) { + + // key is word value is id + + OpenObjectIntHashMap<String> dictionary = null; + if (dictionaryFile.getName().startsWith(fromPrefix)) { + dictionary = fromDictionary; + } else if (dictionaryFile.getName().startsWith(msgIdPrefix)) { + dictionary = msgIdDictionary; + } + if (dictionary != null) { + dictionaryFile = fs.makeQualified(dictionaryFile); + for (Pair<Writable, IntWritable> record + : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) { + dictionary.put(record.getFirst().toString(), record.getSecond().get()); + } + } + } + + } + + public static String[] parseReferences(CharSequence rawRefs) { + String[] splits; + if (rawRefs != null && rawRefs.length() > 0) { + splits = SPACE_OR_CLOSE_ANGLE.split(rawRefs); + for (int i = 0; i < splits.length; i++) { + splits[i] = ANGLE_BRACES.matcher(splits[i]).replaceAll(""); + } + } else { + splits = EMPTY; + } + return splits; + } + + public enum Counters { + NO_MESSAGE_ID, NO_FROM_ADDRESS + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java new file mode 100644 index 0000000..5cd308d --- /dev/null +++ b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java @@ -0,0 +1,61 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.example.email; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.mahout.math.VarIntWritable; + +import java.io.IOException; + +/** + * Assumes the input is in the format created by {@link org.apache.mahout.text.SequenceFilesFromMailArchives} + */ +public final class FromEmailToDictionaryMapper extends Mapper<Text, Text, Text, VarIntWritable> { + + private String separator; + + @Override + protected void setup(Context context) throws IOException, InterruptedException { + super.setup(context); + separator = context.getConfiguration().get(EmailUtility.SEPARATOR); + } + + @Override + protected void map(Text key, Text value, Context context) throws IOException, InterruptedException { + //From is in the value + String valStr = value.toString(); + int idx = valStr.indexOf(separator); + if (idx == -1) { + context.getCounter(EmailUtility.Counters.NO_FROM_ADDRESS).increment(1); + } else { + String full = valStr.substring(0, idx); + //do some cleanup to normalize some things, like: Key: karthik ananth <[email protected]>: Value: 178 + //Key: karthik ananth [mailto:[email protected]]=20: Value: 179 + //TODO: is there more to clean up here? + full = EmailUtility.cleanUpEmailAddress(full); + + if (EmailUtility.WHITESPACE.matcher(full).matches()) { + context.getCounter(EmailUtility.Counters.NO_FROM_ADDRESS).increment(1); + } else { + context.write(new Text(full), new VarIntWritable(1)); + } + } + + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java new file mode 100644 index 0000000..72fcde9 --- /dev/null +++ b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java @@ -0,0 +1,43 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.example.email; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Reducer; +import org.apache.mahout.math.VarIntWritable; + +import java.io.IOException; + +/** + * Key: the string id + * Value: the count + * Out Key: the string id + * Out Value: the sum of the counts + */ +public final class MailToDictionaryReducer extends Reducer<Text, VarIntWritable, Text, VarIntWritable> { + + @Override + protected void reduce(Text key, Iterable<VarIntWritable> values, Context context) + throws IOException, InterruptedException { + int sum = 0; + for (VarIntWritable value : values) { + sum += value.get(); + } + context.write(new Text(key), new VarIntWritable(sum)); + } +}
