Feature: Add grouping support for PageRank

MADLIB-1082

- Add grouping support for pagerank, which will compute a PageRank
probability distribution for the graph represented by each group.
- Add convergence test, so that PageRank computation terminates
when no node's PageRank value changes beyond a threshold across
two consecutive iterations (or when max_iters iterations have
completed, whichever happens first). In case of grouping, the
algorithm terminates only after all groups have converged.
- Create a summary table apart from the output table that records
the number of iterations required for convergence. Iterations
required for convergence of each group are recorded when grouping
is used. This implementation also ensures that we don't compute
PageRank for groups that have already converged.
- Update design doc with PageRank module.

Closes #112


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/c6948930
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/c6948930
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/c6948930

Branch: refs/heads/latest_release
Commit: c6948930661344c629e8a1398040f3b0a80f2136
Parents: f3b906e
Author: Nandish Jayaram <njaya...@apache.org>
Authored: Fri Mar 31 17:03:50 2017 -0700
Committer: Nandish Jayaram <njaya...@apache.org>
Committed: Thu Apr 13 15:39:35 2017 -0700

----------------------------------------------------------------------
 doc/design/figures/pagerank_example.pdf         | 208 +++++++
 doc/design/modules/graph.tex                    | 147 ++++-
 doc/literature.bib                              |   7 +
 src/ports/postgres/modules/graph/pagerank.py_in | 583 ++++++++++++++++---
 .../postgres/modules/graph/pagerank.sql_in      | 183 ++++--
 .../postgres/modules/graph/test/pagerank.sql_in |  63 +-
 6 files changed, 1039 insertions(+), 152 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c6948930/doc/design/figures/pagerank_example.pdf
----------------------------------------------------------------------
diff --git a/doc/design/figures/pagerank_example.pdf 
b/doc/design/figures/pagerank_example.pdf
new file mode 100644
index 0000000..bc1b19c
--- /dev/null
+++ b/doc/design/figures/pagerank_example.pdf
@@ -0,0 +1,208 @@
+%PDF-1.4
+%����
+1 0 obj
+   << 
+      /Title ()
+      /Author ()
+      /Subject ()
+      /Keywords ()
+      /Creator (yExport 1.5)
+      /Producer (org.freehep.graphicsio.pdf.YPDFGraphics2D 1.5)
+      /CreationDate (D:20170407154415-07'00')
+      /ModDate (D:20170407154415-07'00')
+      /Trapped /False
+   >>
+endobj
+2 0 obj
+   << 
+      /Type /Catalog
+      /Pages 3 0 R
+      /ViewerPreferences 4 0 R
+      /OpenAction [5 0 R /Fit]
+   >>
+endobj
+4 0 obj
+   << 
+      /FitWindow true
+      /CenterWindow false
+   >>
+endobj
+5 0 obj
+   << 
+      /Parent 3 0 R
+      /Type /Page
+      /Contents 6 0 R
+   >>
+endobj
+6 0 obj
+   << 
+      /Length 7 0 R
+      /Filter [/ASCII85Decode /FlateDecode]
+   >>
+stream
+Gb!SpbDml/DVaJWj:BJBhN+L7"q5`2p;1Si!RE=&%6sh6Yg,6S,9j2HC82/+[q?K,-RDV(78\F\DLSGR
+^\@Bqk>1mS9>c7lJ,Jm2+)fJc^V&*S^5bDugQ9aj-]-0Y^V9[<gV4aiJ+L.As8LW"amX`TY:h0gTE"i4
+p](5RJ+o8Dp\4.UnSnEhs22C]b3&$D^WP]>lm/'m-/nk=li0oH5@ouV;jki$i$QkLkaOr6hu,V^.Ukm5
+Qa\mE%=^G,fs(/LMPL2Yhu<<KCkI4@kN=%<PPed?SR1W?r4i,Q/9jr%Bt6n9N_=ecoq$BE5HStG]F*DH
+52Pk3Y?@TB4@Y>mh:'tp)AqHYdt;OY\i-Usm`3EIV#),*42([m&U*IFrPMdaIarmJE5^3K$c@8)eqU?0
+r#4TW35+GWpAXkfgE!1n*peAEPKukpX/$]Kkk/"1hdZ+#h(E*&r#^ZC_nJ>Nr`]lV+'sHrl)2Y1as_rp
+*,A"9W5U'@Rd*6UA[e-[b.<XUj?,NTML8r\cuiLdXO`j/,G4o5gp9uR;=]"'--&Nb-ZgFI>TX_N5?t,<
+!1L*_&'D)h%iQ:*er\$i:%HsQY.W7j$0S2!g3<kfBW\Jh/lZ/ZEie6tRZ":j&Re1iX>j"%mEE<;Cb9XF
+P2mmgi39lZG(dU*eb@U_YRI17&?rT,I$TgPa627H6N5:6MjPJsc75lFGt-ElnMG0d=q*shKDMV_cC-#V
+r`Vs"J>XRVOs,ZI>R($C5)WZ^9oWg5d[cP>WCE4PCTqm<@1MZoTp4-d?KH&%EhA-\)=&I-q3kK&eZKk6
+eTF2@jaQHMj573Aqr==ET4e2HU`HVgHo]5aE(-O#,b</F:-3:UXKO4]B<CPUZJFc>YAg0AnWDp![l@Z_
+eQV-GjF#DSWV5d3jIF`RNenX8L#[b8rRemA?:4LpQelFs:@ES]gB[^4FOkF#0i67I,O$H9CE)*L^5L&1
+i1/\UN^9]4(or+Z"EUTW_"H?G-G"oZ3!@#ol&IsGDfbi%i<ai4grVAA,!Q$h*Fkt(f's[O6>&hD5)>9$
+`5`FhS[(6K`6\bPl02r"1j,)#KoD]3[FsCu/m,"@>_tJLCV9n;(%[NB,.e5S@%t=,R-r+Bd_8P1\OBlC
+"83?U(TtEHfWBe'pZ4LFk-p9p.;b)%K`0k93mJko=ZUbMMMG*/'.k0<e;CU+)LHa+D';7iPMs#T-,YAN
+d6hh:I[;teO)JZ[U<bj_6Pb]iU,pat(3Pjs/?56OCMf,pD*E2fX;1QPnt><Sn6)l95"<in72A,6k(8XG
+]r'*nf]'5`p2#0_T@8!hY0Z<]cFj)!d;ZV0`EP75Situk>9*:;s#7Hu5?%HFFX%'/9V:oGd;Y:LrB6al
+5J@@$][PNY`hf.`kZC#S_dsJ;?n8me_AgFdb8Z+mIiFgDFAH8]o'(O4HI-eQctBc;qA<A-FFWFeCaC6a
+6194h(5=VhiU[KfLmcP\c]-II@>o2&?<fBt0;5_,K3,`i)k7I93l_\C.C/<.I8@55L_ErDVm6kL"e+od
+e@/)$]9#o*8d[Y'Fn?i+a1[`m;&>uF@2)P`ei-`b\%(!T>$pgI\Q+KE<qI,.GSm&X58Y(=f1GKL_S:`3
+U:&8E5."rI0*j>V7.LZS=K`N%L9IndiPcoL,F<SP&Z%g^f4D8YLgIARr)f?P!l&M3qB5fL?`b2p(]GlD
+rM^UJ:ClKHIu]#iLXF[tf_9PUd*'/tP](`t:tTu1D3'hH:,hNd46/'lDWuYGCFJP]\'E%8q5!md4cFGh
+O"K)V0G;^r3p.PhB'+GA6S]$3\)HP'GPO8sdFr91%YDlCd-d\cQuP-X7+XO9FEPG:%r7hL)'[+jc36ZY
+9)C<.8/HGjk>u8MkhosDJ.YBnNq/TL(S"CGGWGWYg/a6bZgsrTWt;,P=dJdW6u\>P+@C6I&r4Pf3>sTZ
+l:]nLU**c*qn2!9_&n7Kc7Idr!)G+"(d%:OJL]#T)uRuH-Js-W2Hlc+C8Fm;fiRaX0Ur_;b;1Z6\4\G.
+>:[<j7Nh4X<j]G-'Sbf^rIF<UT*JrTV(m]P_gh$nAH0>&mhBN[<eNQsBE_TJp[Xt[M/mZj^)8B9BMH-H
+0X;?h'8P]K&4qc&6P*pSEl?c@[&<RC?AQidEbV/PkS@o!9s-3L>-YVBQkd1[m\su)2`b$hY?%dtN5R7o
+LJ>3OCSb<@4%fmc\FPmQn+/I5j'0k`\&kskG4ldkR8&J#QO=TT00TnA4Qr^A*2D<R>h$*h7o+\Dd`u0B
+BLPB+j","$b62q4E':dW>`D!SISSgUoQcCk+pHes%(=Knka"5!YS?]bq4/K)Hj'I".FX"-3$r;)+UJ:Z
+KB6c7LsO@/F87rXin,+kjN,-q\5;[5GTt6M,8ZA3d,"2N5Mfl#N<A;_D,qUIo#GY00kgJDlPS9:cI'N)
+S/>do%QZKt&*E=@e)X1;:*o[+i1khU$;6L7e4ls'.rMI+am9r1:<JDuEE:2oJPsO:!LY%=QM`X2WqG&*
+YZ9;2O//[pUUur-nl]\!>lKaEoLF=dC%l)t6Fnf5DL&'B/#5"(NBAOJ,^>Yo83BP+.j[BA_D;U(>r,@T
+%!>^i^pX:`8H\CD&c1E\VL#X.?>^T?/ekp-Gel>taDu9BkEd]d4=XH3frTT0T@&;!l</DKIDtLO8IC5b
+^Jb\-F(F!UK=NaOpouQVrBO"gCaaa!ja9VJ>%&^>#rcn>(clW&gP#teWfOV.#%`Uj:g[fZU6Wfmqa%g4
+liO]$1FVK/"@jYHIs>e=<ec\QH+Yi4D+Z=tKB%VQ/(Qo'ImUu0P]l9b<(8jS+"g3C<aQ:\?[Hbm1oR7o
+G"Gr?U,/*L,odZPFS-r_+:hFo;U4fF"^SRD<g2L*]N=*oAT0TnU^Z#=?DV7\"XcZDb9uDiP<X%XJ;SIK
+"HPer2bGI9;k#]d0mqHM6akdXb4$9G31C2'MmFg"IkKr>'2j=qS*p'Yg8s>-\$N0S(OY7^fB<+qf^HRr
+=jqQXrnF!L[/9;7[hH+PHa'G`pYS8UZg@[4jS+F>h1%UdpRfq-qp/l^qp.7-qp5%sqp06:J",;]hg\k\
+!k\IBg@]XhmB/N9p@k4@ir4!5r#=5ck9FOYS?Cb!5-+*9j2gZ'L]"pI!C#gi9b's:g9&.r]>\u8L$`N^
+MFq@nFn'd>l=IM>[t'["]Vq3k`,j5NR29u[6YU8BJS>C;A7Es.kpEal4tE3bXEWhU<@ll"9guZ/\6<1@
+.H!3rcYe<^l9njW6#?kgrT3X,MmHVQ\V\4DU#98pDjgI6O$"pW]B+-q>a"]>=Yl7.cI%atkW\'gh("Bk
+JTc>iV)=5Z#!1-Ud.(./X4/$21p'*^U,IuV&FSYu:c6TTWB+&hdRGRp7\i-EH,p590U]CD*amBUbW"fP
+SO&,EqG,o[[SBEL7P<ug7<_+H0e:_(/q6AahFc^cX3jF(i[*gN,&bG5Kp3]iKp+.*]6m!*=aF=%n]=U.
+(poK=U@nm6Y#L=&Cb*k'4J\sCA?'N@aK='pPrW*0*#FLg&]XY[-'#U[3P+h*(%f^(pT04\6JsE`"BA.j
+2rEQY<Z<nn"K<1@_aS_ip!d9]`Mn]GpJ4NX9&Q^r"h!(t3cj8Q2jbce(1/%&O)=S0LQ5l_A?t;I%M#Uj
+E9K8@<1WmF:cR[0cnP4_i/>KUT=37SgJ[g&.r=J2]-4Y[Z\Qh'Y@a-6AP4CX-b1fd'Xl)fCoNCN>AA8t
+QJ/EP98t1&V8dH+;TEZJ%j!NCl(O=p><87F<Oj]P:%gtfq&i(*AkO-Js2&A@D5>i_UDpYkAi3e-*18C,
+aT"ah+2aA>4:cTcnBu4%#q="ja\?DkD=Crl=rtm8EOTn$GNCTr/0#AbPPCY*:tB@`RgZE^k4BSJBi`#^
+705en+uhX<D:^rGSR&iBG\FSM*)qs+PQGm1#OU[4/`rJEk7JLt:<-0XOj<XjI%TqI\8ig1jB4Kq"4:C[
+]B-@?]]>;4@lHsQ+b/VYO3c:>BAq*:O'ffP`"d_qp08fsae^$)>JGefr&K/?ESk:B=icj]PYJ04Z.f'B
+,Gt^TWn^;.MR,q;0UUd3;W]rJi3t^5hr-Z6egZI1aQ49cUW\Fa1$`9idH#3Or+>7f>a4Bt84^Ar?-%T-
+9UcbpJrGJ*lmBQ\HpcmrH<_n=`h\>TX(s%qT-MZa)YOnZjCBDe,VsQ66#J)cH>A3>`/u:k[6'*^CC$E$
+7rWS5(Fs/J.oSTQ.g0b$kuoR:#(4D-X`ccQX7\OX_d+u60P*Q[7tJdi+)"9AMq'g$dfE25oI&s-`/3#6
+(o"-<DAQK6O`-VO)1XFDfna;fVW)\]Il4ig^34h)W%%j$Hgo[*XZUtJ=\pa:HqfN%aFiNH31[a:nLt]B
+[d,m76&s[,+$?6O@'.-]52EQ#^d*k-q%U=C(C,)bFj1:G`T_qB5-)tVd*e-;'NGUaE(SnZ&!p.$7;g<j
+#uBt(j)je:)O%jHiEf:I/.Tcf7CnO(Djm]<GWd"to"=4Cj%HGYe[&M]*63k)8Xic>naI$jb!QmYf;^s0
+Ofh\4hY0[D=GdBd1)*n,R$:<DUfI*o+LEUS2<J(r7fdUAJ^WpRK?)]KekXudqq:bY41*/8CK8pl1Tb[i
+VDhGBiEDN2BLf,*`jBJr<nF?\3[ZOVW1Tb`Na/V)9</Q^XJT!eZWp\AAT:GFR%//u3ZQ[je#L=c8iNlp
+.S-%0iE:C4XhllPHIJmWAGKJ#*+)lLb/7iuCd&6LlEp<X=nj=P\Bi$9+mni'n:%&GpK<%9`n`hpEKpD2
+Kjjg/5#OS&drh;%B9gg&*6Di5IbeWCCd&5QkNP/<cj67tDa*IJZZ)MqG:)4I%/3m+"oj=X*mck?'lYLY
+Y59b6X*b]G83Y3qr]fs-q>&#^',K4U@=3_@Pp!eaLe85NcJ0m84@;sPi_u2B"9l));W+Fu2qkAH'Qbo?
+k-H[eYVU0^+Qq?4@?*JaL>GuXM!$`Q@6jQZ&EemB"OL-T$5EF4>9t[e0"6i78l3aCFa&dK-0QfX%m*#M
+8e<**6mW-%mA^<ZZa3j^#l(D&d1Sko-r>,eY$32#7[EgsmT'YRXAK9J7o9R:^UHfH[.1.!;60ROZ9Kg)
+^fMh<YG*ZF40hITi1XdprDj43\lK..=]Gd6o,SGp?ReA>bH2DqQYTG)!j/L9'WCmZ[s"eT`CQ9b9ep%7
+i:Epo[AI.S.$3nX@pO;)^_IiS09r22etWl=fIK1glpiACnWa])GauNe'F]<u]iBJAoF87!IIt"=I:17O
+[lTo@n=72PMr%pGoddarfZKth!*h,64;,NO'U(%PKkP26KsuD=.^J=C#7tW1d%n"CNULX*H%$XXg2Pkd
+p`d)QlrBf.VYpO%$8TBUkc^?MMs:/,N>Y*&ZBDDJ45'>V*]nOlgs`A\d<`N,-@kNLM:TkP1')Js-3C_@
+F"D\),@uq,Y1mHUea0P\AG4[N%=;nNlu>h1aop'kR/jg"-9+e#-@i8A7%iI&h^T9o;YJc-\ioaiAWMfq
+Zc<UPs"H[d`ZprgfWo5!oil!ZT5-i+!hX8`.hf^"A\pVKZ##jRcUJ(=M6bZ9A]!\X2U,l72U+6[2U2'j
+Cg(5iVP3`aRAcG5=#LNO?CB_6OVnD8.MIAbs)tG:pdeuFS??MRHrah-S0MUoT(6O&!.0-p9[pLhPmO+a
+1/ZeP<$q)MkGh2U?'U73T(R-Jb(PgTL7NF)PmP_SPmMIF=_@5OX%_#LDNtO$X"o7X62/MimHmrB!4jWn
+JZLU=GjG<1]2^2JY#crt<`LL33D3Fbi[NcHpoQ<&d8sn2T*[ns_N@nH!o"()XYLKOZ-!41G#]k3j/XpI
+.!B7Z.MEe)/rnAHb!bXW;4.]+PmG*Jbhp7!<QD_^HY"D*jl,_jmTu<#m!4*Lms4rKQM8GSa*VC7nO7@[
+.MIAO*,aUi(Pda'i]+Y(95t.Hg9Nn1Kb+0G]`8&hg-s$X'cHknZOZdT)r@BGm6Y)up'm4kfo/B*Iobr]
+;lgKlV!g:(#N]b$BCQMHV&`KYPdb7rF3$:[2+dHDj2#\7XO#G^P\#h[E?[+JF<SpWGZn1LXbL2.ee9fZ
+euDgo\F5=`-('l'PE*4`rP1c28b=;#+Lq0PCR1r%+#lpMfYYPD:o/k@H0&OC4Ap+;Dj;'dpJ@'AAF(kk
+f;)%1j5W^AFU[$fY#?#q@eeA$rLCuPs#KfMDm!;%#u8<:9nH#WY9m7rojd`g'$Gb()2e=/EBMXZW_#rE
+mQ[2kmQ]aQmQ]_jmQ\a@ruHJeP<eB4-3;r'8Xq0.8QBIC^],96s+]q?Z]T_1p7n);ZeG@j*81;bOT,jK
+;,#\h\k6<+lZ2^#p;bcSBP6^nc`c7JZ1]Q#X_k9?l1YaX^-&!/?sB,#r>Fm,4#>Ykqm2Db[6LEj]_87+
+Rk!+2;+k6mlhZPaC4C@(PLog+g:CkGrY1O'XP+aLH-Bd+5Q8.q\7"]"mW<6Ed2<;j%8YORCueL]cff8M
+rRn8V[6JZ\PLoddq6ME.=(R6UH:K\i)%f?_'Y2-,=<CPkV&doEMQ'sGBT2U]F%!^?1"HkWX/HXeerPsi
+VHT8jEmf;i3F(]k%0;J9AWE)i;q]f<Z$qh)n#\*Sm2EP1C?2,$o^/$j]'*/(r#JqA+.0<^N+B"3F=l)<
+=Z_U(3,7cBi*F&?nLQic3sqLrjqtJ:AqK_:Z<0>qaj!:1\8]PHMtJ5P<0Ot;0MC+:lN`='1!Eal,>3"J
+i0PrrqX1%]*:11<CgWLB/bt9N7+)5Rc[D]6gar.ORn<NrZ,QRehdZQ"(@.;#XNo1\$X`NEU6fffNPiG7
+o'o@0=Vc[&?_:8"n9iTSNq=hT4=A]`A8UBp_!Kf9Vp0sKQ*h9G#k3fWJr4qGnh;%Z^He/Y=RMi[G3ooS
+D'j>nOSSc-h0u+G.<IM>j=ecBcrd?=+t-/RV@36SK0:OFKhEEl!B;,Voa>njJJ"9a%g,5uUl,\n[(8rl
+I8JcoL!V=M8)$eHIb=r4"!NkH^06qK_8mllW!QK(-(H=:deM'^14,U\"A*4k'qX/5bJX_@>FM`/j;Ln&
+o"0@tLJPm+FC>mCGj+mVoNbPa\qUTVL96r;REVBa3kb=%Rmb2m3IJ8Nk_$VP`3I8=&]rMA0>dL28YW9C
+Ck,K]?2XX;eQj-[68q=40C\&+@UNa+5*u6ig:qU@9OjB\+.e5S\d*mfd%uR:Q[?*n8m,!u3@s&jPEMLu
+?P;2fr@))F]Ka&c^ueA,&SfM&#rk7_\2Q_*[9_XBKbENB!:a#`GP^Zbb'.m]'HGarH4rTRpobfS<d>A(
+Oh;"`f=_L2mS32;N2Cq8FR-f5!8@,:-9+fkgPh0hE,Qpsn8EQ_'5t?>Hp=NEhF,T!4ZLO3]nrZbJ)pKT
+rQR:VAJb?af0YO*0D^C3;Gr(74`K<77sKBcp)hQ*=5j[o&1r3,bb_ZLkAU:R)56q<O/eXFYHY6W-&_\>
+BB8j0=1%n@d([9oJaYZ=c7sqN7Gi=\q0gd<CX0[4Znrd1oVha@7+&,+97Y&(9t-^#K]0JdrpfnnIfH5/
+lX.Pss7tCUkb4G7Xm0=(0uD[LrAF.Y&;LK&<F>Gebdca'1G`3!ZTa;hP%+sJK=;VBE.+>Jl;PHN_5p8\
+-aWDGbl?1qVN9bTqY0p5qA6m%\a,l;7-u53rR^9U@p:8:#4q,+X[449o:lN#.A9pI#u1?1Fo1q3HAd>E
+7*<GP#^IgP3rZD^6,!'*0tb@dg^gHC9E)Hi-@[174+n2u<VoSSAs&b+LK;:9`.:8PZ2S6U\6TKW:=f/u
+^h<;dq.&8kErO0%X0UA7\Em"OJ<tHA<t5S`*Q2=`(&JMie!HoR'&p:a4(dRi.UN:ec1pC.]I0]I*S7E2
+'.-_*.V=,$1(UnZV<c'6`&F$2prcffS][R0ddK`cXj!Zir8cVoU9@WKqcA-F=#GKTc]Bse>mG#fho;nj
+M-i?Oc:#pT^?hf2!VH$r?L/h5i_'KnM7?.7nC7mjX0Q\nIc4?Z;pd,_$V''.9c]C8hFmUncnF:(,O[ks
+J,`[55G.uU2e3Y@ao~>
+endstream
+endobj
+7 0 obj
+   8038
+endobj
+3 0 obj
+   << 
+      /Parent null
+      /Type /Pages
+      /MediaBox [0.0000 0.0000 331.00 240.00]
+      /Resources 8 0 R
+      /Kids [5 0 R]
+      /Count 1
+   >>
+endobj
+9 0 obj
+   [/PDF /Text /ImageC]
+endobj
+10 0 obj
+   << 
+      /S /Transparency
+      /CS /DeviceRGB
+      /I true
+      /K false
+   >>
+endobj
+11 0 obj
+   << 
+      /Alpha1
+      << 
+         /ca 1.0000
+         /CA 1.0000
+         /BM /Normal
+         /AIS false
+      >>
+   >>
+endobj
+8 0 obj
+   << 
+      /ProcSet 9 0 R
+      /ExtGState 11 0 R
+   >>
+endobj
+xref
+0 12
+0000000000 65535 f 
+0000000015 00000 n 
+0000000315 00000 n 
+0000008780 00000 n 
+0000000445 00000 n 
+0000000521 00000 n 
+0000000609 00000 n 
+0000008757 00000 n 
+0000009234 00000 n 
+0000008950 00000 n 
+0000008989 00000 n 
+0000009091 00000 n 
+trailer
+<< 
+   /Size 12
+   /Root 2 0 R
+   /Info 1 0 R
+>>
+startxref
+9307
+%%EOF

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c6948930/doc/design/modules/graph.tex
----------------------------------------------------------------------
diff --git a/doc/design/modules/graph.tex b/doc/design/modules/graph.tex
index 5c3910c..1d0233c 100644
--- a/doc/design/modules/graph.tex
+++ b/doc/design/modules/graph.tex
@@ -22,11 +22,12 @@
 \chapter[Graph]{Graph}
 
 \begin{moduleinfo}
-\item[Author] \href{mailto:okis...@pivotal.io}{Orhan Kislal}
+\item[Authors] \href{mailto:okis...@pivotal.io}{Orhan Kislal}, 
\href{mailto:njaya...@pivotal.io}{Nandish Jayaram}
 \item[History]
        \begin{modulehistory}
                \item[v0.1] Initial version, SSSP only.
                \item[v0.2] Graph Framework, SSSP implementation details.
+        \item[v0.3] PageRank
        \end{modulehistory}
 \end{moduleinfo}
 
@@ -272,3 +273,147 @@ of the algorithm.
 Please note that, for ideal performance, \emph{vertex} and \emph{edge} tables
 should be distributed on \emph{vertex id} and \emph{source id} respectively.
 
+\section{PageRank} \label{sec:graph:pagerank}
+\begin{figure}[h]
+    \centering
+    \includegraphics[width=0.5\textwidth]{figures/pagerank_example.pdf}
+\caption{An example graph for PageRank}
+\label{pagerank:example}
+\end{figure}
+
+PageRank is a link analysis algorithm that assigns a score to every vertex
+measuring the relative importance of vertices within the set of all
+vertices. PageRank~\cite{pagerank} was first used by Google to measure the
+importance of website pages where the World Wide Web was modeled as a directed
+graph. Figure~\ref{pagerank:example} shows an example graph with the PageRank
+value of each vertex. The intuition behind the algorithm is that the number and
+quality of links to a vertex determine the authoritativeness of the vertex,
+which is reflected in the PageRank scores as shown in the figure.
+
+The pagerank module in MADlib implements the model of a random surfer who
+follows the edges of a graph to traverse it, and jumps to a random vertex
+after several clicks. The random surfer is modeled using a damping factor
+that represents the probability with which the surfer will continue to follow
+links in the graph rather than jumping to a random vertex. MADlib's pagerank
+module outputs a probability distribution that represents the likelihood that
+the random surfer arrives at a particular vertex in the graph.
+
+PageRank is an iterative algorithm where the PageRank scores of vertices from
+the previous iteration are used to compute the new PageRank scores. The
+PageRank score of a vertex $v$, at the $i^{th}$ iteration, denoted by $PR(v_i)$
+is computed as:
+
+\begin{equation}
+PR(v_i) = \frac{1-d}{N} + d \sum_{u \in M(v)}(\frac{PR(u_{i-1})}{L(u)})
+\label{eq:pagerank}
+\end{equation}
+
+where $N$ is the number of vertices in the graph, $d$ is the damping factor,
+$M(v)$ represents the set of vertices that have an edge to vertex $v$,
+$L(u)$ represents the out-degree of vertex $u$, i.e., the number of
+out-going edges from vertex $u$, and $PR(u_{i-1})$ represents the PageRank
+score of vertex $u$ in the $(i-1)^{st}$ iteration.
+
+$\frac{1-d}{N}$ represents the tiny probability with which the surfer
+would randomly jump to vertex $v$, rather than arriving at $v$ following
+links in the graph. This ensures that there is some probability of visiting
+every vertex in the graph even if they do not have any incoming edges. Note
+that the PageRank score computed for a vertex $v$ using~\ref{eq:pagerank}
+in the $i^{th}$ iteration is not updated until the new score is computed for
+all the vertices in the graph. The computation terminates either when the
+PageRank score of no vertex changes beyond a threshold across two consecutive
+iterations, or when a pre-set number of iterations are completed.
+
+\subsection{Implementation Details} \label{sec:pagerank:implementation}
+
+In this section, we discuss the MADlib implementation of PageRank in depth.
+We maintain two tables at every iteration: $previous$ and $cur$. The
+$previous$ table maintains the PageRank scores of all vertices computed in
+the previous iteration, while $cur$ maintains the updated scores of all
+vertices in the current iteration.
+
+\begin{algorithm}[PageRank$(V,E)$] \label{alg:pagerank:high}
+\begin{algorithmic}[1]
+    \State Create $previous$ table with a default PageRank score of
+            $\frac{1}{N}$ for every vertex
+    \Repeat
+        \State Create empty table $cur$.
+        \State Update $cur$ using PageRank scores of vertices in $previous$
+        \State Update PageRank scores of vertices without incoming edges
+        \State Drop $previous$ and rename $cur$ to $previous$
+    \Until {PageRank scores have converged or \emph{max} iterations have 
elapsed}
+\end{algorithmic}
+\end{algorithm}
+
+The implementation consists of updating the PageRank scores of all vertices
+at every iteration, using the PageRank scores of vertices from the previous
+iteration. The PageRank score of every vertex is initialized to $\frac{1}{N}$
+where $N$ is the total number of vertices in the graph. The out-degree of
+every vertex in the graph (represented by $L(u)$ in eq.~\ref{eq:pagerank}),
+is captured in table $out\_cnts$. The following query is used to create and
+update the PageRank scores in $cur$ table using the PageRank scores in
+$previous$ table.
+
+\begin{algorithm}[Update PageRank scores$(previous,out\_cnts,d,N)$]
+\label{alg:pagerank:update}
+\begin{lstlisting}
+CREATE TABLE cur AS
+    SELECT edge_table.dest AS id,
+        SUM(previous1.pagerank/out_cnts.cnt)*d + (1-d)/N AS pagerank
+    FROM edge_table
+        INNER JOIN previous ON edge_table.dest = previous.id
+        INNER JOIN out_cnts ON edge_table.src = out_cnts.id
+        INNER JOIN previous AS previous1 ON edge_table.src = previous1.id
+    GROUP BY edge_table.dest
+
+-- Update PageRank scores of vertices without any incoming edges:
+INSERT INTO cur
+    SELECT id, (1-d)/N AS pagerank
+    FROM previous
+    WHERE id NOT IN (
+        SELECT id
+        FROM cur
+    )
+\end{lstlisting}
+\end{algorithm}
+
+The PageRank computation is terminated either when a fixed number of iterations
+are completed, or when the PageRank scores of all vertices have converged. The
+PageRank score of a vertex is deemed converged if the absolute difference in
+its PageRank scores from $previous$ and $cur$ is less than a specified 
threshold.
+The following query is used to find all the vertices whose PageRank scores have
+not converged yet.
+
+\begin{algorithm}[Update PageRank scores$(previous,cur,threshold)$]
+\label{alg:pagerank:update}
+\begin{lstlisting}
+SELECT id
+FROM cur
+INNER JOIN previous ON cur.id = previous.id
+WHERE ABS(previous.pagerank - cur.pagerank) > threshold
+\end{lstlisting}
+\end{algorithm}
+
+\subsection{Best Practices} \label{sec:pagerank:bestpractices}
+
+The pagerank module in MADlib has a few optional parameters: damping factor
+$d$, number of iterations $max$, and the threshold for convergence $threshold$.
+The default values for these parameters when not specified by the user are
+$0.85$, $100$ and $\frac{1}{N*100}$ respectively.
+
+The damping factor denotes the probability with which the surfer uses the edges
+to traverse the graph. If set to $0$, it implies that the only way a surfer
+would visit a vertex in the graph is by randomly jumping to it. If set to
+$1$, it implies that the only way the surfer can reach a vertex is by following
+the edges in the graph, thus precluding the surfer from reaching a vertex
+that has no incoming edges. It is common practice to set damping factor
+to $0.85$~\cite{pagerank}, and the maximum number of iterations to $100$.
+The convergence test for PageRank in MADlib checks for the delta between
+the PageRank scores of a vertex across two consecutive iterations. Since
+the initial value of the PageRank score is set to $\frac{1}{N}$, the delta
+will be small in the initial iterations when $N$ is large (say over 100
+million). We thus set the threshold to $\frac{1}{N*100}$, and it is to be
+noted that this is not based on any experimental study. Users of MADlib are
+encouraged to consider this factor when setting a value for threshold, since
+a high $threshold$ value would lead to early termination of PageRank
+computation, thus resulting in incorrect PageRank values.

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c6948930/doc/literature.bib
----------------------------------------------------------------------
diff --git a/doc/literature.bib b/doc/literature.bib
index 0353c23..c98a131 100644
--- a/doc/literature.bib
+++ b/doc/literature.bib
@@ -907,3 +907,10 @@ Applied Survival Analysis},
   year={1956},
   institution={DTIC Document}
 }
+
+@inproceedings{pagerank,
+       booktitle = {Seventh International World-Wide Web Conference (WWW)},
+           title = {The Anatomy of a Large-Scale Hypertextual Web Search 
Engine},
+          author = {S. Brin and L. Page},
+            year = {1998}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c6948930/src/ports/postgres/modules/graph/pagerank.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/pagerank.py_in 
b/src/ports/postgres/modules/graph/pagerank.py_in
index 13cdcc5..202e536 100644
--- a/src/ports/postgres/modules/graph/pagerank.py_in
+++ b/src/ports/postgres/modules/graph/pagerank.py_in
@@ -31,33 +31,37 @@ import plpy
 from utilities.control import MinWarning
 from utilities.utilities import _assert
 from utilities.utilities import extract_keyvalue_params
-from utilities.utilities import unique_string
-from utilities.control import IterationController2S
+from utilities.utilities import unique_string, split_quoted_delimited_str
+from utilities.validate_args import columns_exist_in_table, get_cols_and_types
 from graph_utils import *
 
-import time
-
 m4_changequote(`<!', `!>')
 
-def validate_pagerank_args(vertex_table, vertex_id, edge_table, edge_params,
-        out_table, damping_factor, max_iter, threshold, module_name):
+def validate_pagerank_args(schema_madlib, vertex_table, vertex_id, edge_table,
+        edge_params, out_table, damping_factor, max_iter, threshold,
+        grouping_cols_list, module_name):
     """
     Function to validate input parameters for PageRank
     """
     validate_graph_coding(vertex_table, vertex_id, edge_table, edge_params,
         out_table, module_name)
     _assert(damping_factor >= 0.0 and damping_factor <= 1.0,
-        """PageRank: Invalid damping factor value ({0}), must be between 0 and 
1."""
-        .format(damping_factor))
-    _assert(threshold >= 0.0 and threshold <= 1.0,
-        """PageRank: Invalid threshold value ({0}), must be between 0 and 1."""
-        .format(threshold))
+        """PageRank: Invalid damping factor value ({0}), must be between 0 and 
1.""".
+        format(damping_factor))
+    _assert(not threshold or (threshold >= 0.0 and threshold <= 1.0),
+        """PageRank: Invalid threshold value ({0}), must be between 0 and 
1.""".
+        format(threshold))
     _assert(max_iter > 0,
-        """PageRank: Invalid max_iter value ({0}), must be a positive integer. 
"""
-        .format(max_iter))
+        """PageRank: Invalid max_iter value ({0}), must be a positive 
integer.""".
+        format(max_iter))
+    if grouping_cols_list:
+        # validate the grouping columns. We currently only support 
grouping_cols
+        # to be column names in the edge_table, and not expressions!
+        _assert(columns_exist_in_table(edge_table, grouping_cols_list, 
schema_madlib),
+                "PageRank error: One or more grouping columns specified do not 
exist!")
 
 def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
-    out_table, damping_factor, max_iter, threshold, **kwargs):
+    out_table, damping_factor, max_iter, threshold, grouping_cols, **kwargs):
     """
     Function that computes the PageRank
 
@@ -87,66 +91,278 @@ def pagerank(schema_madlib, vertex_table, vertex_id, 
edge_table, edge_args,
         damping_factor = 0.85
     if max_iter is None:
         max_iter = 100
-    if threshold is None:
-        threshold = 0.00001
     if vertex_id is None:
         vertex_id = "id"
-    validate_pagerank_args(vertex_table, vertex_id, edge_table, edge_params,
-        out_table, damping_factor, max_iter, threshold, 'PageRank')
+    if not grouping_cols:
+        grouping_cols = ''
+
+    grouping_cols_list = split_quoted_delimited_str(grouping_cols)
+    validate_pagerank_args(schema_madlib, vertex_table, vertex_id, edge_table,
+        edge_params, out_table, damping_factor, max_iter, threshold,
+        grouping_cols_list, 'PageRank')
+    summary_table = out_table + "_summary"
+    _assert(not table_exists(summary_table),
+        "Graph PageRank: Output summary table ({summary_table}) already 
exists."
+        .format(**locals()))
     src = edge_params["src"]
     dest = edge_params["dest"]
+    nvertices = plpy.execute("""
+                SELECT COUNT({0}) AS cnt
+                FROM {1}
+            """.format(vertex_id, vertex_table))[0]["cnt"]
+    # A fixed threshold value, of say 1e-5, might not work well when the
+    # number of vertices is a billion, since the initial pagerank value
+    # of all nodes would then be 1/1e-9. So, assign default threshold
+    # value based on number of nodes in the graph.
+    # NOTE: The heuristic below is not based on any scientific evidence.
+    if threshold is None:
+        threshold = 1.0/(nvertices*100)
+
+    # table/column names used when grouping_cols is set.
+    distinct_grp_table = ''
+    vertices_per_group = ''
+    vpg = ''
+    grouping_where_clause = ''
+    group_by_clause = ''
+    random_prob = ''
 
     edge_temp_table = unique_string(desp='temp_edge')
     distribution = m4_ifdef(<!__POSTGRESQL__!>, <!''!>,
         <!"DISTRIBUTED BY ({0})".format(dest)!>)
-    plpy.execute("""
-        DROP TABLE IF EXISTS {edge_temp_table};
-        CREATE TEMP TABLE {edge_temp_table} AS
+    plpy.execute("DROP TABLE IF EXISTS {0}".format(edge_temp_table))
+    plpy.execute("""CREATE TEMP TABLE {edge_temp_table} AS
         SELECT * FROM {edge_table}
         {distribution}
         """.format(**locals()))
     # GPDB and HAWQ have distributed by clauses to help them with indexing.
-    # For Postgres we add the indices manually.
+    # For Postgres we add the index explicitly.
     sql_index = m4_ifdef(<!__POSTGRESQL__!>,
         <!"""CREATE INDEX ON {edge_temp_table} ({src});
         """.format(**locals())!>,
         <!''!>)
     plpy.execute(sql_index)
 
-    nvertices = plpy.execute("""
-            SELECT COUNT({0}) AS cnt
-            FROM {1}
-        """.format(vertex_id, vertex_table))[0]["cnt"]
-    init_value = 1.0/nvertices
-    random_prob = (1.0-damping_factor)/nvertices
+    # Intermediate tables required.
     cur = unique_string(desp='cur')
     message = unique_string(desp='message')
-    plpy.execute("""
-            CREATE TEMP TABLE {cur} AS
-            SELECT {vertex_id}, {init_value}::DOUBLE PRECISION AS pagerank
-            FROM {vertex_table}
-        """.format(**locals()))
-    v1 = unique_string(desp='v1')
-
+    cur_unconv = unique_string(desp='cur_unconv')
+    message_unconv = unique_string(desp='message_unconv')
     out_cnts = unique_string(desp='out_cnts')
     out_cnts_cnt = unique_string(desp='cnt')
-    # Compute the out-degree of every node in the graph.
+    v1 = unique_string(desp='v1')
+
     cnts_distribution = m4_ifdef(<!__POSTGRESQL__!>, <!''!>,
-        <!"DISTRIBUTED BY ({0})".format(vertex_id)!>)
+            <!"DISTRIBUTED BY ({0})".format(vertex_id)!>)
+    cur_join_clause = """{edge_temp_table}.{dest}={cur}.{vertex_id}
+        """.format(**locals())
+    out_cnts_join_clause = """{out_cnts}.{vertex_id}={edge_temp_table}.{src}
+        """.format(**locals())
+    v1_join_clause = """{v1}.{vertex_id}={edge_temp_table}.{src}
+        """.format(**locals())
 
-    plpy.execute("""
-        DROP TABLE IF EXISTS {out_cnts};
-        CREATE TEMP TABLE {out_cnts} AS
-        SELECT {src} AS {vertex_id}, COUNT({dest}) AS {out_cnts_cnt}
-        FROM {edge_table}
-        GROUP BY {src}
-        {cnts_distribution}
-        """.format(**locals()))
+    random_probability = (1.0-damping_factor)/nvertices
+    ######################################################################
+    # Create several strings that will be used to construct required
+    # queries. These strings will be required only during grouping.
+    random_jump_prob = random_probability
+    ignore_group_clause_first = ''
+    limit = ' LIMIT 1 '
+
+    grouping_cols_select_pr = ''
+    vertices_per_group_inner_join_pr = ''
+    ignore_group_clause_pr= ''
+
+    grouping_cols_select_ins = ''
+    vpg_from_clause_ins = ''
+    vpg_where_clause_ins = ''
+    message_grp_where_ins = ''
+    ignore_group_clause_ins = ''
+
+    grouping_cols_select_conv = '{0}.{1}'.format(cur, vertex_id)
+    group_by_grouping_cols_conv = ''
+    message_grp_clause_conv = ''
+    ignore_group_clause_conv = ''
+    ######################################################################
+
+    # Queries when groups are involved need a lot more conditions in
+    # various clauses, so populating the required variables. Some intermediate
+    # tables are unnecessary when no grouping is involved, so create some
+    # tables and certain columns only during grouping.
+    if grouping_cols:
+        distinct_grp_table = unique_string(desp='grp')
+        plpy.execute("DROP TABLE IF EXISTS {0}".format(distinct_grp_table))
+        plpy.execute("""CREATE TEMP TABLE {distinct_grp_table} AS
+                SELECT DISTINCT {grouping_cols} FROM {edge_table}
+            """.format(**locals()))
+        vertices_per_group = unique_string(desp='nvert_grp')
+        init_pr = unique_string(desp='init')
+        random_prob = unique_string(desp='rand')
+        subq = unique_string(desp='subquery')
+        rand_damp = 1-damping_factor
+        grouping_where_clause = ' AND '.join(
+            [distinct_grp_table+'.'+col+'='+subq+'.'+col
+            for col in grouping_cols_list])
+        group_by_clause = ', '.join([distinct_grp_table+'.'+col
+            for col in grouping_cols_list])
+        # Find number of vertices in each group, this is the normalizing factor
+        # for computing the random_prob
+        plpy.execute("DROP TABLE IF EXISTS {0}".format(vertices_per_group))
+        plpy.execute("""CREATE TEMP TABLE {vertices_per_group} AS
+                SELECT {distinct_grp_table}.*,
+                1/COUNT(__vertices__)::DOUBLE PRECISION AS {init_pr},
+                {rand_damp}/COUNT(__vertices__)::DOUBLE PRECISION AS 
{random_prob}
+                FROM {distinct_grp_table} INNER JOIN (
+                    SELECT {grouping_cols}, {src} AS __vertices__
+                    FROM {edge_table}
+                    UNION
+                    SELECT {grouping_cols}, {dest} FROM {edge_table}
+                ){subq}
+                ON {grouping_where_clause}
+                GROUP BY {group_by_clause}
+            """.format(**locals()))
+
+        grouping_where_clause = ' AND '.join(
+            [vertices_per_group+'.'+col+'='+subq+'.'+col
+            for col in grouping_cols_list])
+        group_by_clause = ', '.join([vertices_per_group+'.'+col
+            for col in grouping_cols_list])
+        plpy.execute("""
+                CREATE TEMP TABLE {cur} AS
+                SELECT {group_by_clause}, {subq}.__vertices__ as {vertex_id},
+                       {init_pr} AS pagerank
+                FROM {vertices_per_group} INNER JOIN (
+                    SELECT {grouping_cols}, {src} AS __vertices__
+                    FROM {edge_table}
+                    UNION
+                    SELECT {grouping_cols}, {dest} FROM {edge_table}
+                ){subq}
+                ON {grouping_where_clause}
+            """.format(**locals()))
+        vpg = unique_string(desp='vpg')
+        # Compute the out-degree of every node in the group-based subgraphs.
+        plpy.execute("DROP TABLE IF EXISTS {0}".format(out_cnts))
+        plpy.execute("""CREATE TEMP TABLE {out_cnts} AS
+            SELECT {grouping_cols_select} {src} AS {vertex_id},
+                   COUNT({dest}) AS {out_cnts_cnt}
+            FROM {edge_table}
+            GROUP BY {grouping_cols_select} {src}
+            {cnts_distribution}
+            """.format(grouping_cols_select=grouping_cols+','
+                if grouping_cols else '', **locals()))
+
+        message_grp = ' AND '.join(
+            ["{cur}.{col}={message}.{col}".format(**locals())
+                for col in grouping_cols_list])
+        cur_join_clause = cur_join_clause + ' AND ' + ' AND '.join(
+            ["{edge_temp_table}.{col}={cur}.{col}".format(**locals())
+                for col in grouping_cols_list])
+        out_cnts_join_clause = out_cnts_join_clause + ' AND ' + ' AND '.join(
+            ["{edge_temp_table}.{col}={out_cnts}.{col}".format(**locals())
+                for col in grouping_cols_list])
+        v1_join_clause = v1_join_clause + ' AND ' + ' AND '.join(
+            ["{edge_temp_table}.{col}={v1}.{col}".format(**locals())
+                for col in grouping_cols_list])
+        vpg_join_clause = ' AND '.join(
+            ["{edge_temp_table}.{col}={vpg}.{col}".format(**locals())
+                for col in grouping_cols_list])
+        vpg_cur_join_clause = ' AND '.join(
+            ["{cur}.{col}={vpg}.{col}".format(**locals())
+                for col in grouping_cols_list])
+        # join clause specific to populating random_prob for nodes without any
+        # incoming edges.
+        edge_grouping_cols_select = ', '.join(
+            ["{edge_temp_table}.{col}".format(**locals())
+                for col in grouping_cols_list])
+        cur_grouping_cols_select = ', '.join(
+            ["{cur}.{col}".format(**locals()) for col in grouping_cols_list])
+        # Create output summary table:
+        cols_names_types = get_cols_and_types(edge_table)
+        grouping_cols_clause = ', '.join([c_name+" "+c_type
+            for (c_name, c_type) in cols_names_types
+            if c_name in grouping_cols_list])
+        plpy.execute("""
+                CREATE TABLE {summary_table} (
+                    {grouping_cols_clause},
+                    __iterations__ INTEGER
+                )
+            """.format(**locals()))
+        # Create output table. This will be updated whenever a group converges
+        # Note that vertex_id is assumed to be an integer (as described in
+        # documentation)
+        plpy.execute("""
+                CREATE TABLE {out_table} (
+                    {grouping_cols_clause},
+                    {vertex_id} INTEGER,
+                    pagerank DOUBLE PRECISION
+                )
+            """.format(**locals()))
+        temp_summary_table = unique_string(desp='temp_summary')
+        plpy.execute("DROP TABLE IF EXISTS {0}".format(temp_summary_table))
+        plpy.execute("""
+                CREATE TABLE {temp_summary_table} (
+                    {grouping_cols_clause}
+                )
+            """.format(**locals()))
+        ######################################################################
+        # Strings required for the main PageRank computation query
+        grouping_cols_select_pr = edge_grouping_cols_select+', '
+        random_jump_prob = 'MIN({vpg}.{random_prob})'.format(**locals())
+        vertices_per_group_inner_join_pr = """INNER JOIN {vertices_per_group}
+            AS {vpg} ON {vpg_join_clause}""".format(**locals())
+        ignore_group_clause_pr=' WHERE '+get_ignore_groups(summary_table,
+            edge_temp_table, grouping_cols_list)
+        # Strings required for updating PageRank scores of vertices that have
+        # no incoming edges
+        grouping_cols_select_ins = cur_grouping_cols_select+','
+        vpg_from_clause_ins = ', {vertices_per_group} AS {vpg}'.format(
+            **locals())
+        vpg_where_clause_ins = '{vpg_cur_join_clause} AND '.format(
+            **locals())
+        message_grp_where_ins = 'WHERE {message_grp}'.format(**locals())
+        ignore_group_clause_ins = ' AND '+get_ignore_groups(summary_table,
+            cur, grouping_cols_list)
+        # Strings required for convergence test query
+        grouping_cols_select_conv = cur_grouping_cols_select
+        group_by_grouping_cols_conv = ' GROUP BY {0}'.format(
+            cur_grouping_cols_select)
+        message_grp_clause_conv = '{0} AND '.format(message_grp)
+        ignore_group_clause_conv = ' AND '+get_ignore_groups(summary_table,
+            cur, grouping_cols_list)
+        limit = ''
+    else:
+        # cur and out_cnts tables can be simpler when no grouping is involved.
+        init_value = 1.0/nvertices
+        plpy.execute("""
+                CREATE TEMP TABLE {cur} AS
+                SELECT {vertex_id}, {init_value}::DOUBLE PRECISION AS pagerank
+                FROM {vertex_table}
+            """.format(**locals()))
 
-    for i in range(max_iter):
+        # Compute the out-degree of every node in the graph.
+        plpy.execute("DROP TABLE IF EXISTS {0}".format(out_cnts))
+        plpy.execute("""CREATE TEMP TABLE {out_cnts} AS
+            SELECT {src} AS {vertex_id}, COUNT({dest}) AS {out_cnts_cnt}
+            FROM {edge_table}
+            GROUP BY {src}
+            {cnts_distribution}
+            """.format(**locals()))
+
+        # The summary table when there is no grouping will contain only
+        # the iteration column. We don't need to create the out_table
+        # when no grouping is used since the 'cur' table will be renamed
+        # to out_table after pagerank computation is completed.
+        plpy.execute("""
+                CREATE TABLE {summary_table} (
+                    __iterations__ INTEGER
+                )
+            """.format(**locals()))
+    unconverged = 0
+    iteration_num = 0
+    for iteration_num in range(max_iter):
         #####################################################################
         # PageRank for node 'A' at any given iteration 'i' is given by:
-        # PR_i(A) = damping_factor(PR_i-1(B)/degree(B) + PR_i-1(C)/degree(C) + 
...) + (1-damping_factor)/N
+        # PR_i(A) = damping_factor(PR_i-1(B)/degree(B) +
+        #           PR_i-1(C)/degree(C) + ...) + (1-damping_factor)/N
         # where 'N' is the number of vertices in the graph,
         # B, C are nodes that have edges to node A, and
         # degree(node) represents the number of outgoing edges from 'node'
@@ -157,45 +373,183 @@ def pagerank(schema_madlib, vertex_table, vertex_id, 
edge_table, edge_args,
         # More information can be found at:
         # https://en.wikipedia.org/wiki/PageRank#Damping_factor
 
-        # The query below computes the PageRank of each node using the above 
formula.
+        # The query below computes the PageRank of each node using the above
+        # formula. A small explanatory note on ignore_group_clause:
+        # This is used only when grouping is set. This essentially will have
+        # the condition that will help skip the PageRank computation on groups
+        # that have converged.
         plpy.execute("""
                 CREATE TABLE {message} AS
-                SELECT {edge_temp_table}.{dest} AS {vertex_id},
-                        
SUM({v1}.pagerank/{out_cnts}.{out_cnts_cnt})*{damping_factor}+{random_prob} AS 
pagerank
+                SELECT {grouping_cols_select_pr} {edge_temp_table}.{dest} AS 
{vertex_id},
+                        
SUM({v1}.pagerank/{out_cnts}.{out_cnts_cnt})*{damping_factor}+{random_jump_prob}
 AS pagerank
                 FROM {edge_temp_table}
-                    INNER JOIN {cur} ON 
{edge_temp_table}.{dest}={cur}.{vertex_id}
-                    INNER JOIN {out_cnts} ON 
{out_cnts}.{vertex_id}={edge_temp_table}.{src}
-                    INNER JOIN {cur} AS {v1} ON 
{v1}.{vertex_id}={edge_temp_table}.{src}
-                GROUP BY {edge_temp_table}.{dest}
-            """.format(**locals()))
-        # If there are nodes that have no incoming edges, they are not 
captured in the message table.
-        # Insert entries for such nodes, with random_prob.
+                    INNER JOIN {cur} ON {cur_join_clause}
+                    INNER JOIN {out_cnts} ON {out_cnts_join_clause}
+                    INNER JOIN {cur} AS {v1} ON {v1_join_clause}
+                    {vertices_per_group_inner_join_pr}
+                {ignore_group_clause}
+                GROUP BY {grouping_cols_select_pr} {edge_temp_table}.{dest}
+            """.format(ignore_group_clause=ignore_group_clause_pr
+                    if iteration_num>0 else ignore_group_clause_first,
+                **locals()))
+        # If there are nodes that have no incoming edges, they are not
+        # captured in the message table. Insert entries for such nodes,
+        # with random_prob.
         plpy.execute("""
                 INSERT INTO {message}
-                SELECT {vertex_id}, {random_prob}::DOUBLE PRECISION AS pagerank
-                FROM {cur}
-                WHERE {vertex_id} NOT IN (
+                SELECT {grouping_cols_select_ins} {cur}.{vertex_id},
+                    {random_jump_prob} AS pagerank
+                FROM {cur} {vpg_from_clause_ins}
+                WHERE {vpg_where_clause_ins} {vertex_id} NOT IN (
                     SELECT {vertex_id}
                     FROM {message}
+                    {message_grp_where_ins}
                 )
-            """.format(**locals()))
-        # Check for convergence will be done as part of grouping support for 
pagerank:
-        # https://issues.apache.org/jira/browse/MADLIB-1082. So, the threshold 
parameter
-        # is a dummy variable at the moment, the PageRank computation happens 
for
-        # {max_iter} number of times.
+                {ignore_group_clause}
+                GROUP BY {grouping_cols_select_ins} {cur}.{vertex_id}
+            """.format(ignore_group_clause=ignore_group_clause_ins
+                    if iteration_num>0 else ignore_group_clause_first,
+                **locals()))
+
+        # Check for convergence:
+        ## Check for convergence only if threshold != 0.
+        if threshold != 0:
+            # message_unconv and cur_unconv will contain the unconverged groups
+            # after current and previous iterations respectively. Groups that
+            # are missing in message_unconv but appear in cur_unconv are the
+            # groups that have converged after this iteration's computations.
+            # If no grouping columns are specified, then we check if there is
+            # at least one unconverged node (limit 1 is used in the query).
+            plpy.execute("""
+                    CREATE TEMP TABLE {message_unconv} AS
+                    SELECT {grouping_cols_select_conv}
+                    FROM {message}
+                    INNER JOIN {cur}
+                    ON {cur}.{vertex_id}={message}.{vertex_id}
+                    WHERE {message_grp_clause_conv}
+                        ABS({cur}.pagerank-{message}.pagerank) > {threshold}
+                    {ignore_group_clause}
+                    {group_by_grouping_cols_conv}
+                    {limit}
+                """.format(ignore_group_clause=ignore_group_clause_ins
+                        if iteration_num>0 else ignore_group_clause_conv,
+                    **locals()))
+            unconverged = plpy.execute("""SELECT COUNT(*) AS cnt FROM {0}
+                """.format(message_unconv))[0]["cnt"]
+            if iteration_num > 0 and grouping_cols:
+                # Update result and summary tables for groups that have
+                # converged
+                # since the last iteration.
+                update_result_tables(temp_summary_table, iteration_num,
+                    summary_table, out_table, message, grouping_cols_list,
+                    cur_unconv, message_unconv)
+            plpy.execute("DROP TABLE IF EXISTS {0}".format(cur_unconv))
+            plpy.execute("""ALTER TABLE {message_unconv} RENAME TO
+                {cur_unconv} """.format(**locals()))
+        else:
+            # Do not run convergence test if threshold=0, since that implies
+            # the user wants to run max_iter iterations.
+            unconverged = 1
+        plpy.execute("DROP TABLE IF EXISTS {0}".format(cur))
+        plpy.execute("""ALTER TABLE {message} RENAME TO {cur}
+                """.format(**locals()))
+        if unconverged == 0:
+            break
+
+    # If there still are some unconverged groups/(entire table),
+    # update results.
+    if grouping_cols:
+        if unconverged > 0:
+            if threshold != 0:
+                # We completed max_iters, but there are still some unconverged
+                # groups. Update the result and summary tables for unconverged
+                # groups.
+                update_result_tables(temp_summary_table, iteration_num,
+                    summary_table, out_table, cur, grouping_cols_list,
+                    cur_unconv)
+            else:
+                # No group has converged. List of all group values are in
+                # distinct_grp_table.
+                update_result_tables(temp_summary_table, iteration_num,
+                    summary_table, out_table, cur, grouping_cols_list,
+                    distinct_grp_table)
+    else:
+        plpy.execute("""ALTER TABLE {table_name} RENAME TO {out_table}
+            """.format(table_name=cur, **locals()))
         plpy.execute("""
-                DROP TABLE IF EXISTS {cur};
-                ALTER TABLE {message} RENAME TO {cur}
+                INSERT INTO {summary_table} VALUES
+                ({iteration_num}+1);
             """.format(**locals()))
 
-    plpy.execute("ALTER TABLE {cur} RENAME TO {out_table}".format(**locals()))
-
     ## Step 4: Cleanup
-    plpy.execute("""
-        DROP TABLE IF EXISTS {0},{1},{2},{3};
-        """.format(out_cnts, edge_temp_table, cur, message))
+    plpy.execute("""DROP TABLE IF EXISTS {0},{1},{2},{3},{4},{5};
+        """.format(out_cnts, edge_temp_table, cur, message, cur_unconv,
+                    message_unconv))
+    if grouping_cols:
+        plpy.execute("""DROP TABLE IF EXISTS {0},{1},{2};
+            """.format(vertices_per_group, temp_summary_table,
+                distinct_grp_table))
     plpy.execute("SET client_min_messages TO %s" % old_msg_level)
 
+def update_result_tables(temp_summary_table, i, summary_table, out_table,
+    res_table, grouping_cols_list, cur_unconv, message_unconv=None):
+    """
+        This function updates the summary and output tables only for those
+        groups that have converged. This is found out by looking at groups
+        that appear in cur_unconv but not in message_unconv: message_unconv
+        consists of groups that have not converged in the current iteration,
+        while cur_unconv contains groups that had not converged in the
+        previous iterations. The entries in cur_unconv is a superset of the
+        entries in message_unconv. So the difference in the groups across
+        the two tables represents the groups that converged in the current
+        iteration.
+    """
+    plpy.execute("TRUNCATE TABLE {0}".format(temp_summary_table))
+    if message_unconv is None:
+        # If this function is called after max_iter is completed, without
+        # convergence, all the unconverged groups from cur_unconv are used
+        # (note that message_unconv is renamed to cur_unconv before checking
+        # for unconverged==0 in the pagerank function's for loop)
+        plpy.execute("""
+            INSERT INTO {temp_summary_table}
+            SELECT * FROM {cur_unconv}
+            """.format(**locals()))
+    else:
+        plpy.execute("""
+            INSERT INTO {temp_summary_table}
+            SELECT {cur_unconv}.*
+            FROM {cur_unconv}
+            WHERE {join_condition}
+            """.format(join_condition=get_ignore_groups(
+                message_unconv, cur_unconv, grouping_cols_list), **locals()))
+    plpy.execute("""
+        INSERT INTO {summary_table}
+        SELECT *, {i}+1 AS __iteration__
+        FROM {temp_summary_table}
+        """.format(**locals()))
+    plpy.execute("""
+        INSERT INTO {out_table}
+        SELECT {res_table}.*
+        FROM {res_table}
+        INNER JOIN {temp_summary_table}
+        ON {join_condition}
+        """.format(join_condition=' AND '.join(
+                ["{res_table}.{col}={temp_summary_table}.{col}".format(
+                    **locals())
+                for col in grouping_cols_list]), **locals()))
+
+def get_ignore_groups(first_table, second_table, grouping_cols_list):
+    """
+        This function generates the necessary clause to only select the
+        groups that appear in second_table and not in first_table.
+    """
+    return """({second_table_cols}) NOT IN (SELECT {grouping_cols} FROM
+    {first_table}) """.format(second_table_cols=', '.join(
+            ["{second_table}.{col}".format(**locals())
+            for col in grouping_cols_list]),
+        grouping_cols=', '.join([col for col in grouping_cols_list]),
+        **locals())
+
 def pagerank_help(schema_madlib, message, **kwargs):
     """
     Help function for pagerank
@@ -212,12 +566,20 @@ def pagerank_help(schema_madlib, message, **kwargs):
             message.lower() in ("usage", "help", "?"):
         help_string = "Get from method below"
         help_string = get_graph_usage(schema_madlib, 'PageRank',
-            """out_table       TEXT,  -- Name of the output table for PageRank
-    damping_factor, DOUBLE PRECISION, -- Damping factor in random surfer model
-                                      -- (DEFAULT = 0.85)
-    max_iter,       INTEGER,          -- Maximum iteration number (DEFAULT = 
100)
-    threshold       DOUBLE PRECISION  -- Stopping criteria (DEFAULT = 1e-5)
-""")
+            """out_table     TEXT, -- Name of the output table for PageRank
+    damping_factor DOUBLE PRECISION, -- Damping factor in random surfer model
+                                     -- (DEFAULT = 0.85)
+    max_iter      INTEGER, -- Maximum iteration number (DEFAULT = 100)
+    threshold     DOUBLE PRECISION, -- Stopping criteria (DEFAULT = 1/(N*100),
+                                    -- N is number of vertices in the graph)
+    grouping_col  TEXT -- Comma separated column names to group on
+                       -- (DEFAULT = NULL, no grouping)
+""") + """
+
+A summary table is also created that contains information regarding the
+number of iterations required for convergence. It is named by adding the
+suffix '_summary' to the 'out_table' parameter.
+"""
     else:
         if message is not None and \
                 message.lower() in ("example", "examples"):
@@ -232,7 +594,8 @@ CREATE TABLE vertex(
         );
 CREATE TABLE edge(
         src INTEGER,
-        dest INTEGER
+        dest INTEGER,
+        user_id INTEGER
         );
 INSERT INTO vertex VALUES
 (0),
@@ -243,30 +606,62 @@ INSERT INTO vertex VALUES
 (5),
 (6);
 INSERT INTO edge VALUES
-(0, 1),
-(0, 2),
-(0, 4),
-(1, 2),
-(1, 3),
-(2, 3),
-(2, 5),
-(2, 6),
-(3, 0),
-(4, 0),
-(5, 6),
-(6, 3);
+(0, 1, 1),
+(0, 2, 1),
+(0, 4, 1),
+(1, 2, 1),
+(1, 3, 1),
+(2, 3, 1),
+(2, 5, 1),
+(2, 6, 1),
+(3, 0, 1),
+(4, 0, 1),
+(5, 6, 1),
+(6, 3, 1),
+(0, 1, 2),
+(0, 2, 2),
+(0, 4, 2),
+(1, 2, 2),
+(1, 3, 2),
+(2, 3, 2),
+(3, 0, 2),
+(4, 0, 2),
+(5, 6, 2),
+(6, 3, 2);
 
 -- Compute the PageRank:
-DROP TABLE IF EXISTS pagerank_out;
+DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
+SELECT madlib.pagerank(
+             'vertex',             -- Vertex table
+             'id',                 -- Vertex id column
+             'edge',               -- Edge table
+             'src=src, dest=dest', -- Comma delimited string of edge arguments
+             'pagerank_out');      -- Output table of PageRank
+
+-- View the PageRank of all vertices, sorted by their scores.
+SELECT * FROM pagerank_out ORDER BY pagerank DESC;
+-- View the summary table to find the number of iterations required for
+-- convergence.
+SELECT * FROM pagerank_out_summary;
+
+-- Compute PageRank of nodes associated with each user:
+DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
 SELECT madlib.pagerank(
              'vertex',             -- Vertex table
              'id',                 -- Vertix id column
              'edge',               -- Edge table
              'src=src, dest=dest', -- Comma delimted string of edge arguments
-             'pagerank_out')       -- Output table of PageRank
+             'pagerank_out',       -- Output table of PageRank
+             NULL,                 -- Default damping factor
+             NULL,                 -- Default max_iter
+             0.00000001,           -- Threshold
+             'user_id');           -- Grouping column
 
 -- View the PageRank of all vertices, sorted by their scores.
-SELECT * FROM pagerank_out ORDER BY pagerank desc;
+SELECT * FROM pagerank_out ORDER BY user_id, pagerank DESC;
+-- View the summary table to find the number of iterations required for
+-- convergence for each group.
+SELECT * FROM pagerank_out_summary;
 """
         else:
             help_string = """

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c6948930/src/ports/postgres/modules/graph/pagerank.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/pagerank.sql_in 
b/src/ports/postgres/modules/graph/pagerank.sql_in
index 712d146..6531bb5 100644
--- a/src/ports/postgres/modules/graph/pagerank.sql_in
+++ b/src/ports/postgres/modules/graph/pagerank.sql_in
@@ -58,7 +58,8 @@ pagerank( vertex_table,
             out_table,
             damping_factor,
             max_iter,
-            threshold
+            threshold,
+            grouping_cols
           )
 </pre>
 
@@ -91,7 +92,13 @@ this string argument:
 It will contain a row for every vertex from 'vertex_table' with
 the following columns:
   - vertex_id : The id of a vertex. Will use the input parameter 'vertex_id' 
for column naming.
-  - pagerank : The vertex's PageRank.</dd>
+  - pagerank : The vertex's PageRank.
+  - grouping_cols : Grouping column (if any) values associated with the 
vertex_id.</dd>
+
+A summary table is also created that contains information 
+regarding the number of iterations required for convergence.
+It is named by adding the suffix '_summary' to the 'out_table' 
+parameter.
 
 <dt>damping_factor</dt>
 <dd>FLOAT8, default 0.85. The probability, at any step, that a user will 
continue following the links in a random surfer model.</dd>
@@ -100,9 +107,18 @@ the following columns:
 <dd>INTEGER, default: 100. The maximum number of iterations allowed.</dd>
 
 <dt>threshold</dt>
-<dd>FLOAT8, default: 1e-5. If the difference between the PageRank of every 
vertex of two consecutive
+<dd>FLOAT8, default: (1/number of vertices * 100). If the difference between 
the PageRank of every vertex of two consecutive
 iterations is smaller than 'threshold', or the iteration number is larger than 
'max_iter', the
-computation stops.  If you set the threshold to zero, then you will force the 
algorithm to run for the full number of iterations specified in 'max_iter'.</dd>
+computation stops.  If you set the threshold to zero, then you will force the 
algorithm to run for the full number of iterations specified in 'max_iter'.
+It is advisable to set threshold to a value lower than 1/(number of vertices 
in the graph) since the PageRank value of nodes is initialized to that
+value.</dd>
+
+<dt>grouping_cols (optional)</dt>
+<dd>TEXT, default: NULL. A single column or a list of comma-separated
+columns that divides the input data into discrete groups, resulting in one
+distribution per group. When this value is NULL, no grouping is used and
+a single model is generated for all data.
+@note Expressions are not currently supported for 'grouping_cols'.</dd>
 
 </dl>
 
@@ -122,7 +138,8 @@ CREATE TABLE vertex(
         );
 CREATE TABLE edge(
         src INTEGER,
-        dest INTEGER
+        dest INTEGER,
+        user_id INTEGER
         );
 INSERT INTO vertex VALUES
 (0),
@@ -133,47 +150,66 @@ INSERT INTO vertex VALUES
 (5),
 (6);
 INSERT INTO edge VALUES
-(0, 1),
-(0, 2),
-(0, 4),
-(1, 2),
-(1, 3),
-(2, 3),
-(2, 5),
-(2, 6),
-(3, 0),
-(4, 0),
-(5, 6),
-(6, 3);
+(0, 1, 1),
+(0, 2, 1),
+(0, 4, 1),
+(1, 2, 1),
+(1, 3, 1),
+(2, 3, 1),
+(2, 5, 1),
+(2, 6, 1),
+(3, 0, 1),
+(4, 0, 1),
+(5, 6, 1),
+(6, 3, 1),
+(0, 1, 2),
+(0, 2, 2),
+(0, 4, 2),
+(1, 2, 2),
+(1, 3, 2),
+(2, 3, 2),
+(3, 0, 2),
+(4, 0, 2),
+(5, 6, 2),
+(6, 3, 2);
 </pre>
 
 -# Compute the PageRank:
 <pre class="syntax">
-DROP TABLE IF EXISTS pagerank_out;
+DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
 SELECT madlib.pagerank(
                          'vertex',             -- Vertex table
                          'id',                 -- Vertix id column
                          'edge',               -- Edge table
                          'src=src, dest=dest', -- Comma delimted string of 
edge arguments
                          'pagerank_out');      -- Output table of PageRank
-SELECT * FROM pagerank_out ORDER BY pagerank desc;
+SELECT * FROM pagerank_out ORDER BY pagerank DESC;
 </pre>
 <pre class="result">
  id |      pagerank
-----+--------------------
-  0 |  0.278256122055856
-  3 |  0.201882680839737
-  2 |  0.142878491945534
-  6 |  0.114538731993905
-  1 |  0.100266150276761
-  4 |  0.100266150276761
-  5 |  0.061911672611445
+----+-------------------
+  0 |  0.28753749341184
+  3 |  0.21016988901855
+  2 |  0.14662683454062
+  4 |  0.10289614384217
+  1 |  0.10289614384217
+  6 |  0.09728637768887
+  5 |  0.05258711765692
 (7 rows)
 </pre>
+<pre class="syntax">
+SELECT * FROM pagerank_out_summary;
+</pre>
+<pre class="result">
+ __iterations__
+ ----------------+
+             16
+(1 row)
+</pre>
 
--# Run PageRank with a damping factor of 0.5 results in different final values:
+-# Running PageRank with a damping factor of 0.5 results in different final 
values:
 <pre class="syntax">
-DROP TABLE IF EXISTS pagerank_out;
+DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
 SELECT madlib.pagerank(
                          'vertex',             -- Vertex table
                          'id',                 -- Vertix id column
@@ -181,21 +217,67 @@ SELECT madlib.pagerank(
                          'src=src, dest=dest', -- Comma delimted string of 
edge arguments
                          'pagerank_out',       -- Output table of PageRank
                          0.5);                 -- Damping factor
-SELECT * FROM pagerank_out ORDER BY pagerank desc;
+SELECT * FROM pagerank_out ORDER BY pagerank DESC;
 </pre>
 <pre class="result">
- id |     pagerank      
-----+-------------------
-  0 | 0.221378135793372
-  3 | 0.191574922960784
-  6 | 0.140994575864846
-  2 | 0.135406336658892
-  4 | 0.108324751971412
-  1 | 0.108324751971412
-  5 | 0.093996524779681
+ id |      pagerank      
+----+--------------------
+  0 |  0.225477161441199
+  3 |  0.199090328586664
+  2 |  0.136261327206477
+  6 |  0.132691559968224
+  4 |  0.109009291409508
+  1 |  0.109009291409508
+  5 | 0.0884610399788161
 (7 rows)
 </pre>
 
+-# Now compute the PageRank of vertices associated with each user
+using the grouping feature:
+<pre class="syntax">
+DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
+SELECT madlib.pagerank(
+                         'vertex',             -- Vertex table
+                         'id',                 -- Vertex id column
+                         'edge',               -- Edge table
+                         'src=src, dest=dest', -- Comma delimited string of 
edge arguments
+                         'pagerank_out',       -- Output table of PageRank
+                         NULL,                 -- Default damping factor (0.85)
+                         NULL,                 -- Default max iters (100)
+                         0.00000001,           -- Threshold
+                         'user_id');           -- Grouping column name
+SELECT * FROM pagerank_out ORDER BY user_id, pagerank DESC;
+</pre>
+<pre class="result">
+ user_id | id |      pagerank
+---------+----+--------------------
+       1 |  0 |  0.27825488388552
+       1 |  3 |  0.20188114667075
+       1 |  2 |  0.14288112346059
+       1 |  6 |  0.11453637832147
+       1 |  1 |  0.10026745615438
+       1 |  4 |  0.10026745615438
+       1 |  5 |  0.06191155535288
+       2 |  0 |  0.31854625004173
+       2 |  3 |  0.23786686773343
+       2 |  2 |  0.15914876489397
+       2 |  1 |  0.11168334437971
+       2 |  4 |  0.11168334437971
+       2 |  6 |  0.03964285714285
+       2 |  5 |  0.02142857142857
+(14 rows)
+</pre>
+<pre class="syntax">
+SELECT * FROM pagerank_out_summary ORDER BY user_id;
+</pre>
+<pre class="result">
+ user_id | __iterations__
+---------+----------------
+       1 |             27
+       2 |             31
+(2 rows)
+</pre>
+
 @anchor literature
 @par Literature
 
@@ -210,7 +292,8 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
     out_table       TEXT,
     damping_factor  FLOAT8,
     max_iter        INTEGER,
-    threshold       FLOAT8
+    threshold       FLOAT8,
+    grouping_cols   VARCHAR
 ) RETURNS VOID AS $$
     PythonFunction(graph, pagerank, pagerank)
 $$ LANGUAGE plpythonu VOLATILE
@@ -223,9 +306,23 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
     edge_args       TEXT,
     out_table       TEXT,
     damping_factor  FLOAT8,
+    max_iter        INTEGER,
+    threshold       FLOAT8
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, $7, $8, NULL)
+$$ LANGUAGE SQL
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+-------------------------------------------------------------------------
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
+    vertex_table    TEXT,
+    vertex_id       TEXT,
+    edge_table      TEXT,
+    edge_args       TEXT,
+    out_table       TEXT,
+    damping_factor  FLOAT8,
     max_iter        INTEGER
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, $7, 0.00001)
+    SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, $7, 0.00001, NULL)
 $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 -------------------------------------------------------------------------
@@ -237,7 +334,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
     out_table       TEXT,
     damping_factor  FLOAT8
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, 100, 0.00001)
+    SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, 100, 0.00001, NULL)
 $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 -------------------------------------------------------------------------
@@ -248,7 +345,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
     edge_args       TEXT,
     out_table       TEXT
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, 0.85, 100, 0.00001)
+    SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, 0.85, 100, 0.00001, NULL)
 $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 -------------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c6948930/src/ports/postgres/modules/graph/test/pagerank.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/test/pagerank.sql_in 
b/src/ports/postgres/modules/graph/test/pagerank.sql_in
index 1d695e2..2e84f35 100644
--- a/src/ports/postgres/modules/graph/test/pagerank.sql_in
+++ b/src/ports/postgres/modules/graph/test/pagerank.sql_in
@@ -19,13 +19,14 @@
  *
  *//* ----------------------------------------------------------------------- 
*/
 
-DROP TABLE IF EXISTS vertex, edge, pagerank_out;
+DROP TABLE IF EXISTS vertex, edge;
 CREATE TABLE vertex(
         id INTEGER
         );
 CREATE TABLE edge(
         src INTEGER,
-        dest INTEGER
+        dest INTEGER,
+        user_id INTEGER
         );
 INSERT INTO vertex VALUES
 (0),
@@ -36,19 +37,30 @@ INSERT INTO vertex VALUES
 (5),
 (6);
 INSERT INTO edge VALUES
-(0, 1),
-(0, 2),
-(0, 4),
-(1, 2),
-(1, 3),
-(2, 3),
-(2, 5),
-(2, 6),
-(3, 0),
-(4, 0),
-(5, 6),
-(6, 3);
+(0, 1, 1),
+(0, 2, 1),
+(0, 4, 1),
+(1, 2, 1),
+(1, 3, 1),
+(2, 3, 1),
+(2, 5, 1),
+(2, 6, 1),
+(3, 0, 1),
+(4, 0, 1),
+(5, 6, 1),
+(6, 3, 1),
+(0, 1, 2),
+(0, 2, 2),
+(0, 4, 2),
+(1, 2, 2),
+(1, 3, 2),
+(2, 3, 2),
+(3, 0, 2),
+(4, 0, 2),
+(5, 6, 2),
+(6, 3, 2);
 
+DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
 SELECT madlib.pagerank(
              'vertex',        -- Vertex table
             'id',            -- Vertex id column
@@ -60,3 +72,26 @@ SELECT madlib.pagerank(
 SELECT assert(relative_error(SUM(pagerank), 1) < 0.00001,
         'PageRank: Scores do not sum up to 1.'
     ) FROM pagerank_out;
+
+DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
+SELECT madlib.pagerank(
+             'vertex',        -- Vertex table
+             'id',            -- Vertex id column
+             'edge',          -- Edge table
+             'src=src, dest=dest', -- Edge args
+             'pagerank_out', -- Output table of PageRank
+             NULL,
+             NULL,
+             0.00000001,
+             'user_id');
+
+-- Verify per-group PageRank sums and convergence iteration counts.
+SELECT assert(relative_error(SUM(pagerank), 1) < 0.00001,
+        'PageRank: Scores do not sum up to 1 for group 1.'
+    ) FROM pagerank_out WHERE user_id=1;
+SELECT assert(relative_error(__iterations__, 27) = 0,
+        'PageRank: Incorrect iterations for group 1.'
+    ) FROM pagerank_out_summary WHERE user_id=1;
+SELECT assert(relative_error(__iterations__, 31) = 0,
+        'PageRank: Incorrect iterations for group 2.'
+    ) FROM pagerank_out_summary WHERE user_id=2;

Reply via email to