Added: datafu/site/docs/spark/2.0.0/datafu/spark/SparkDFUtils$.html URL: http://svn.apache.org/viewvc/datafu/site/docs/spark/2.0.0/datafu/spark/SparkDFUtils%24.html?rev=1915275&view=auto ============================================================================== --- datafu/site/docs/spark/2.0.0/datafu/spark/SparkDFUtils$.html (added) +++ datafu/site/docs/spark/2.0.0/datafu/spark/SparkDFUtils$.html Tue Jan 16 19:56:02 2024 @@ -0,0 +1,955 @@ +<!DOCTYPE html > +<html> + <head> + <meta http-equiv="X-UA-Compatible" content="IE=edge" /> + <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" /> + <title>datafu-spark 2.0.0 API - datafu.spark.SparkDFUtils</title> + <meta name="description" content="datafu - spark 2.0.0 API - datafu.spark.SparkDFUtils" /> + <meta name="keywords" content="datafu spark 2.0.0 API datafu.spark.SparkDFUtils" /> + <meta http-equiv="content-type" content="text/html; charset=UTF-8" /> + + + <link href="../../lib/index.css" media="screen" type="text/css" rel="stylesheet" /> + <link href="../../lib/template.css" media="screen" type="text/css" rel="stylesheet" /> + <link href="../../lib/diagrams.css" media="screen" type="text/css" rel="stylesheet" id="diagrams-css" /> + <script type="text/javascript" src="../../lib/jquery.min.js"></script> + <script type="text/javascript" src="../../lib/jquery.panzoom.min.js"></script> + <script type="text/javascript" src="../../lib/jquery.mousewheel.min.js"></script> + <script type="text/javascript" src="../../lib/index.js"></script> + <script type="text/javascript" src="../../index.js"></script> + <script type="text/javascript" src="../../lib/scheduler.js"></script> + <script type="text/javascript" src="../../lib/template.js"></script> + + <script type="text/javascript"> + /* this variable can be used by the JS to determine the path to the root document */ + var toRoot = '../../'; + </script> + + </head> + <body> + <div id="search"> + <span 
id="doc-title">datafu-spark 2.0.0 API<span id="doc-version"></span></span> + <span class="close-results"><span class="left">&lt;</span> Back</span> + <div id="textfilter"> + <span class="input"> + <input autocapitalize="none" placeholder="Search" id="index-input" type="text" accesskey="/" /> + <i class="clear material-icons">&#xE14C;</i> + <i id="search-icon" class="material-icons"></i> + </span> + </div> + </div> + <div id="search-results"> + <div id="search-progress"> + <div id="progress-fill"></div> + </div> + <div id="results-content"> + <div id="entity-results"></div> + <div id="member-results"></div> + </div> + </div> + <div id="content-scroll-container" style="-webkit-overflow-scrolling: touch;"> + <div id="content-container" style="-webkit-overflow-scrolling: touch;"> + <div id="subpackage-spacer"> + <div id="packages"> + <h1>Packages</h1> + <ul> + <li name="_root_.root" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="_root_"></a><a id="root:_root_"></a> + <span class="permalink"> + <a href="../../index.html" title="Permalink"> + <i class="material-icons">&#xE157;</i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier"></span> + <span class="kind">package</span> + </span> + <span class="symbol"> + <a title="" href="../../index.html"><span class="name">root</span></a> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd><a href="../../index.html" class="extype" name="_root_">root</a></dd></dl></div> + </li><li name="_root_.datafu" visbl="pub" class="indented1 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="datafu"></a><a id="datafu:datafu"></a> + <span class="permalink"> + <a href="../../datafu/index.html" title="Permalink"> + <i class="material-icons">&#xE157;</i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier"></span> + <span class="kind">package</span> + </span> + <span class="symbol"> + <a title="" 
href="../index.html"><span class="name">datafu</span></a> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd><a href="../../index.html" class="extype" name="_root_">root</a></dd></dl></div> + </li><li name="datafu.spark" visbl="pub" class="indented2 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="spark"></a><a id="spark:spark"></a> + <span class="permalink"> + <a href="../../datafu/spark/index.html" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier"></span> + <span class="kind">package</span> + </span> + <span class="symbol"> + <a title="" href="index.html"><span class="name">spark</span></a> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd><a href="../index.html" class="extype" name="datafu">datafu</a></dd></dl></div> + </li><li class="current-entities indented2"> + <span class="separator"></span> + <a class="object" href="Aggregators$.html" title="This file contains UDAFs which extend the Aggregator class."></a> + <a href="Aggregators$.html" title="This file contains UDAFs which extend the Aggregator class.">Aggregators</a> + </li><li class="current-entities indented2"> + <span class="separator"></span> + <a class="class" href="CoreBridgeDirectory.html" title="Contains all python files needed by the bridge itself"></a> + <a href="CoreBridgeDirectory.html" title="Contains all python files needed by the bridge itself">CoreBridgeDirectory</a> + </li><li class="current-entities indented2"> + <span class="separator"></span> + <a class="object" href="DataFrameOps$.html" title="implicit class to enable easier usage e.g:"></a> + <a href="DataFrameOps$.html" title="implicit class to enable easier usage e.g:">DataFrameOps</a> + </li><li class="current-entities indented2"> + <span class="separator"></span> + <a class="object" href="PythonPathsManager$.html" title="There are two 
phases of resolving python files path:"></a> + <a href="PythonPathsManager$.html" title="There are two phases of resolving python files path:">PythonPathsManager</a> + </li><li class="current-entities indented2"> + <span class="separator"></span> + <a class="class" href="PythonResource.html" title="Represents a resource that needs to be added to PYTHONPATH used by ScalaPythonBridge."></a> + <a href="PythonResource.html" title="Represents a resource that needs to be added to PYTHONPATH used by ScalaPythonBridge.">PythonResource</a> + </li><li class="current-entities indented2"> + <span class="separator"></span> + <a class="object" href="ResourceCloning$.html" title="Utility for extracting resource from a jar and copy it to a temporary location"></a> + <a href="ResourceCloning$.html" title="Utility for extracting resource from a jar and copy it to a temporary location">ResourceCloning</a> + </li><li class="current-entities indented2"> + <span class="separator"></span> + <a class="object" href="ScalaPythonBridge$.html" title="Do not instantiate this class! Use the companion object instead."></a> + <a href="ScalaPythonBridge$.html" title="Do not instantiate this class! 
Use the companion object instead.">ScalaPythonBridge</a> + </li><li class="current-entities indented2"> + <span class="separator"></span> + <a class="class" href="ScalaPythonBridgeRunner.html" title="this class let's the user invoke PySpark code from scala example usage:"></a> + <a href="ScalaPythonBridgeRunner.html" title="this class let's the user invoke PySpark code from scala example usage:">ScalaPythonBridgeRunner</a> + </li><li class="current-entities indented2"> + <span class="separator"></span> + <a class="object" href="" title=""></a> + <a href="" title="">SparkDFUtils</a> + </li><li class="current-entities indented2"> + <span class="separator"></span> + <a class="class" href="SparkDFUtilsBridge.html" title="class definition so we could expose this functionality in PySpark"></a> + <a href="SparkDFUtilsBridge.html" title="class definition so we could expose this functionality in PySpark">SparkDFUtilsBridge</a> + </li><li class="current-entities indented2"> + <span class="separator"></span> + <a class="object" href="SparkUDAFs$.html" title="UserDefineAggregateFunction is deprecated and will be removed in DataFu 2.1.0 in order to allow compilation with Spark 3.2 and up."></a> + <a href="SparkUDAFs$.html" title="UserDefineAggregateFunction is deprecated and will be removed in DataFu 2.1.0 in order to allow compilation with Spark 3.2 and up.">SparkUDAFs</a> + </li> + </ul> + </div> + </div> + <div id="content"> + <body class="object value"> + <div id="definition"> + <div class="big-circle object">o</div> + <p id="owner"><a href="../index.html" class="extype" name="datafu">datafu</a>.<a href="index.html" class="extype" name="datafu.spark">spark</a></p> + <h1>SparkDFUtils<span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span></h1> + <h3><span class="morelinks"></span></h3> + </div> + + <h4 id="signature" class="signature"> + <span class="modifier_kind"> + <span 
class="modifier"></span> + <span class="kind">object</span> + </span> + <span class="symbol"> + <span class="name">SparkDFUtils</span> + </span> + </h4> + + + <div id="comment" class="fullcommenttop"><div class="toggleContainer block"> + <span class="toggle"> + Linear Supertypes + </span> + <div class="superTypes hiddenContent"><span class="extype" name="scala.AnyRef">AnyRef</span>, <span class="extype" name="scala.Any">Any</span></div> + </div></div> + + + <div id="mbrsel"> + <div class="toggle"></div> + <div id="memberfilter"> + <i class="material-icons arrow">î·</i> + <span class="input"> + <input id="mbrsel-input" placeholder="Filter all members" type="text" accesskey="/" /> + </span> + <i class="clear material-icons">î </i> + </div> + <div id="filterby"> + <div id="order"> + <span class="filtertype">Ordering</span> + <ol> + + <li class="alpha in"><span>Alphabetic</span></li> + <li class="inherit out"><span>By Inheritance</span></li> + </ol> + </div> + <div class="ancestors"> + <span class="filtertype">Inherited<br /> + </span> + <ol id="linearization"> + <li class="in" name="datafu.spark.SparkDFUtils"><span>SparkDFUtils</span></li><li class="in" name="scala.AnyRef"><span>AnyRef</span></li><li class="in" name="scala.Any"><span>Any</span></li> + </ol> + </div><div class="ancestors"> + <span class="filtertype"></span> + <ol> + <li class="hideall out"><span>Hide All</span></li> + <li class="showall in"><span>Show All</span></li> + </ol> + </div> + <div id="visbl"> + <span class="filtertype">Visibility</span> + <ol><li class="public in"><span>Public</span></li><li class="all out"><span>All</span></li></ol> + </div> + </div> + </div> + + <div id="template"> + <div id="allMembers"> + + + + + + + <div class="values members"> + <h3>Value Members</h3> + <ol> + <li name="scala.AnyRef#!=" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="!=(x$1:Any):Boolean"></a><a id="!=(Any):Boolean"></a> + <span class="permalink"> + <a 
href="../../datafu/spark/SparkDFUtils$.html#!=(x$1:Any):Boolean" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier">final </span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span title="gt4s: $bang$eq" class="name">!=</span><span class="params">(<span name="arg0">arg0: <span class="extype" name="scala.Any">Any</span></span>)</span><span class="result">: <span class="extype" name="scala.Boolean">Boolean</span></span> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd>AnyRef â Any</dd></dl></div> + </li><li name="scala.AnyRef###" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="##():Int"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html###():Int" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier">final </span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span title="gt4s: $hash$hash" class="name">##</span><span class="params">()</span><span class="result">: <span class="extype" name="scala.Int">Int</span></span> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd>AnyRef â Any</dd></dl></div> + </li><li name="scala.AnyRef#==" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="==(x$1:Any):Boolean"></a><a id="==(Any):Boolean"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#==(x$1:Any):Boolean" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier">final </span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span title="gt4s: $eq$eq" class="name">==</span><span class="params">(<span name="arg0">arg0: <span class="extype" 
name="scala.Any">Any</span></span>)</span><span class="result">: <span class="extype" name="scala.Boolean">Boolean</span></span> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd>AnyRef â Any</dd></dl></div> + </li><li name="scala.Any#asInstanceOf" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="asInstanceOf[T0]:T0"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#asInstanceOf[T0]:T0" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier">final </span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">asInstanceOf</span><span class="tparams">[<span name="T0">T0</span>]</span><span class="result">: <span class="extype" name="scala.Any.asInstanceOf.T0">T0</span></span> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd>Any</dd></dl></div> + </li><li name="datafu.spark.SparkDFUtils#broadcastJoinSkewed" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="broadcastJoinSkewed(notSkewed:org.apache.spark.sql.DataFrame,skewed:org.apache.spark.sql.DataFrame,joinCol:String,numRowsToBroadcast:Int,filterCnt:Option[Long],joinType:String):org.apache.spark.sql.DataFrame"></a><a id="broadcastJoinSkewed(DataFrame,DataFrame,String,Int,Option[Long],String):DataFrame"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#broadcastJoinSkewed(notSkewed:org.apache.spark.sql.DataFrame,skewed:org.apache.spark.sql.DataFrame,joinCol:String,numRowsToBroadcast:Int,filterCnt:Option[Long],joinType:String):org.apache.spark.sql.DataFrame" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier"></span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span 
class="name">broadcastJoinSkewed</span><span class="params">(<span name="notSkewed">notSkewed: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span>, <span name="skewed">skewed: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span>, <span name="joinCol">joinCol: <span class="extype" name="scala.Predef.String">String</span></span>, <span name="numRowsToBroadcast">numRowsToBroadcast: <span class="extype" name="scala.Int">Int</span></span>, <span name="filterCnt">filterCnt: <span class="extype" name="scala.Option">Option</span>[<span class="extype" name="scala.Long">Long</span>] = <span class="symbol">None</span></span>, <span name="joinType">joinType: <span class="extype" name="scala.Predef.String">String</span> = <span class="symbol">"inner"</span></span>)</span><span class="result">: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span> + </span> + + <p class="shortcomment cmt">Suitable to perform a join in cases when one DF is skewed and the other is not skewed.</p><div class="fullcomment"><div class="comment cmt"><p>Suitable to perform a join in cases when one DF is skewed and the other is not skewed. +splits both of the DFs to two parts according to the skewed keys. +1. Map-join: broadcasts the skewed-keys part of the not skewed DF to the skewed-keys + part of the skewed DF +2. Regular join: between the remaining two parts. 
+</p></div><dl class="paramcmts block"><dt class="param">notSkewed</dt><dd class="cmt"><p>not skewed DataFrame</p></dd><dt class="param">skewed</dt><dd class="cmt"><p>skewed DataFrame</p></dd><dt class="param">joinCol</dt><dd class="cmt"><p>join column</p></dd><dt class="param">numRowsToBroadcast</dt><dd class="cmt"><p>num of rows to broadcast</p></dd><dt class="param">filterCnt</dt><dd class="cmt"><p>filter out unskewed rows from the broadcast to ease limit calculation</p></dd><dt>returns</dt><dd class="cmt"><p>DataFrame representing the data after the operation</p></dd></dl></div> + </li><li name="datafu.spark.SparkDFUtils#changeSchema" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="changeSchema(df:org.apache.spark.sql.DataFrame,newScheme:String*):org.apache.spark.sql.DataFrame"></a><a id="changeSchema(DataFrame,String*):DataFrame"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#changeSchema(df:org.apache.spark.sql.DataFrame,newScheme:String*):org.apache.spark.sql.DataFrame" title="Permalink"> + <i class="material-icons">&#xE157;</i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier"></span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">changeSchema</span><span class="params">(<span name="df">df: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span>, <span name="newScheme">newScheme: <span class="extype" name="scala.Predef.String">String</span>*</span>)</span><span class="result">: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span> + </span> + + <p class="shortcomment cmt">Returns a DataFrame with the column names renamed to the column names in the new schema 
+</p><div class="fullcomment"><div class="comment cmt"><p>Returns a DataFrame with the column names renamed to the column names in the new schema +</p></div><dl class="paramcmts block"><dt class="param">df</dt><dd class="cmt"><p>DataFrame to operate on</p></dd><dt class="param">newScheme</dt><dd class="cmt"><p>new column names</p></dd><dt>returns</dt><dd class="cmt"><p>DataFrame representing the data after the operation</p></dd></dl></div> + </li><li name="scala.AnyRef#clone" visbl="prt" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="clone():Object"></a><a id="clone():AnyRef"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#clone():Object" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier"></span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">clone</span><span class="params">()</span><span class="result">: <span class="extype" name="scala.AnyRef">AnyRef</span></span> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Attributes</dt><dd>protected[<span class="extype" name="java.lang">lang</span>] </dd><dt>Definition Classes</dt><dd>AnyRef</dd><dt>Annotations</dt><dd> + <span class="name">@throws</span><span class="args">(<span> + + <span class="defval" name="classOf[java.lang.CloneNotSupportedException]">...</span> + </span>)</span> + + <span class="name">@native</span><span class="args">()</span> + + </dd></dl></div> + </li><li name="datafu.spark.SparkDFUtils#dedupRandomN" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="dedupRandomN(df:org.apache.spark.sql.DataFrame,groupCol:org.apache.spark.sql.Column,maxSize:Int):org.apache.spark.sql.DataFrame"></a><a id="dedupRandomN(DataFrame,Column,Int):DataFrame"></a> + <span class="permalink"> + <a 
href="../../datafu/spark/SparkDFUtils$.html#dedupRandomN(df:org.apache.spark.sql.DataFrame,groupCol:org.apache.spark.sql.Column,maxSize:Int):org.apache.spark.sql.DataFrame" title="Permalink"> + <i class="material-icons">&#xE157;</i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier"></span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">dedupRandomN</span><span class="params">(<span name="df">df: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span>, <span name="groupCol">groupCol: <span class="extype" name="org.apache.spark.sql.Column">Column</span></span>, <span name="maxSize">maxSize: <span class="extype" name="scala.Int">Int</span></span>)</span><span class="result">: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span> + </span> + + <p class="shortcomment cmt">Used to get the random n records in each group.</p><div class="fullcomment"><div class="comment cmt"><p>Used to get the random n records in each group. Uses an efficient implementation +that doesn't order the data so it can handle large amounts of data. 
+</p></div><dl class="paramcmts block"><dt class="param">df</dt><dd class="cmt"><p>DataFrame to operate on</p></dd><dt class="param">groupCol</dt><dd class="cmt"><p>column to group by the records</p></dd><dt class="param">maxSize</dt><dd class="cmt"><p>The maximal number of rows per group</p></dd><dt>returns</dt><dd class="cmt"><p>DataFrame representing the data after the operation</p></dd></dl></div> + </li><li name="datafu.spark.SparkDFUtils#dedupTopN" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="dedupTopN(df:org.apache.spark.sql.DataFrame,n:Int,groupCol:org.apache.spark.sql.Column,orderCols:org.apache.spark.sql.Column*):org.apache.spark.sql.DataFrame"></a><a id="dedupTopN(DataFrame,Int,Column,Column*):DataFrame"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#dedupTopN(df:org.apache.spark.sql.DataFrame,n:Int,groupCol:org.apache.spark.sql.Column,orderCols:org.apache.spark.sql.Column*):org.apache.spark.sql.DataFrame" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier"></span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">dedupTopN</span><span class="params">(<span name="df">df: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span>, <span name="n">n: <span class="extype" name="scala.Int">Int</span></span>, <span name="groupCol">groupCol: <span class="extype" name="org.apache.spark.sql.Column">Column</span></span>, <span name="orderCols">orderCols: <span class="extype" name="org.apache.spark.sql.Column">Column</span>*</span>)</span><span class="result">: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span> + 
</span> + + <p class="shortcomment cmt">Used to get the top N records (after ordering according to the provided order columns) +in each group.</p><div class="fullcomment"><div class="comment cmt"><p>Used to get the top N records (after ordering according to the provided order columns) +in each group. +</p></div><dl class="paramcmts block"><dt class="param">df</dt><dd class="cmt"><p>DataFrame to operate on</p></dd><dt class="param">n</dt><dd class="cmt"><p>number of records to return from each group</p></dd><dt class="param">groupCol</dt><dd class="cmt"><p>column to group by the records</p></dd><dt class="param">orderCols</dt><dd class="cmt"><p>columns to order the records according to</p></dd><dt>returns</dt><dd class="cmt"><p>DataFrame representing the data after the operation</p></dd></dl></div> + </li><li name="datafu.spark.SparkDFUtils#dedupWithCombiner" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="dedupWithCombiner(df:org.apache.spark.sql.DataFrame,groupCol:Seq[org.apache.spark.sql.Column],orderByCol:Seq[org.apache.spark.sql.Column],desc:Boolean,moreAggFunctions:Seq[org.apache.spark.sql.Column],columnsFilter:Seq[String],columnsFilterKeep:Boolean):org.apache.spark.sql.DataFrame"></a><a id="dedupWithCombiner(DataFrame,Seq[Column],Seq[Column],Boolean,Seq[Column],Seq[String],Boolean):DataFrame"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#dedupWithCombiner(df:org.apache.spark.sql.DataFrame,groupCol:Seq[org.apache.spark.sql.Column],orderByCol:Seq[org.apache.spark.sql.Column],desc:Boolean,moreAggFunctions:Seq[org.apache.spark.sql.Column],columnsFilter:Seq[String],columnsFilterKeep:Boolean):org.apache.spark.sql.DataFrame" title="Permalink"> + <i class="material-icons">&#xE157;</i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier"></span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">dedupWithCombiner</span><span 
class="params">(<span name="df">df: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span>, <span name="groupCol">groupCol: <span class="extype" name="scala.Seq">Seq</span>[<span class="extype" name="org.apache.spark.sql.Column">Column</span>]</span>, <span name="orderByCol">orderByCol: <span class="extype" name="scala.Seq">Seq</span>[<span class="extype" name="org.apache.spark.sql.Column">Column</span>]</span>, <span name="desc">desc: <span class="extype" name="scala.Boolean">Boolean</span> = <span class="symbol">true</span></span>, <span name="moreAggFunctions">moreAggFunctions: <span class="extype" name="scala.Seq">Seq</span>[<span class="extype" name="org.apache.spark.sql.Column">Column</span>] = <span class="symbol">Nil</span></span>, <span name="columnsFilter">columnsFilter: <span class="extype" name="scala.Seq">Seq</span>[<span class="extype" name="scala.Predef.String">String</span>] = <span class="symbol">Nil</span></span>, <span name="columnsFilterKeep">columnsFilterKeep: <span class="extype" name="scala.Boolean">Boolean</span> = <span class="symbol">true</span></span>)</span><span class="result">: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span> + </span> + + <p class="shortcomment cmt">Used to get the 'latest' record (after ordering according to the provided order columns) +in each group.</p><div class="fullcomment"><div class="comment cmt"><p>Used to get the 'latest' record (after ordering according to the provided order columns) +in each group. 
+the same functionality as <code><a href="#dedupWithOrder(df:org.apache.spark.sql.DataFrame,groupCol:org.apache.spark.sql.Column,orderCols:org.apache.spark.sql.Column*):org.apache.spark.sql.DataFrame" class="extmbr" name="datafu.spark.SparkDFUtils#dedupWithOrder">#dedupWithOrder</a></code> but implemented using UDAF to utilize +map side aggregation. +this function should be used in cases when you expect a large number of rows to get combined, +as they share the same group column. +</p></div><dl class="paramcmts block"><dt class="param">df</dt><dd class="cmt"><p>DataFrame to operate on</p></dd><dt class="param">groupCol</dt><dd class="cmt"><p>column to group by the records</p></dd><dt class="param">orderByCol</dt><dd class="cmt"><p>column to order the records according to</p></dd><dt class="param">desc</dt><dd class="cmt"><p>have the order as desc</p></dd><dt class="param">moreAggFunctions</dt><dd class="cmt"><p>more aggregate functions</p></dd><dt class="param">columnsFilter</dt><dd class="cmt"><p>columns to filter</p></dd><dt class="param">columnsFilterKeep</dt><dd class="cmt"><p>indicates whether we should filter the selected columns 'out' + or alternatively have only those columns in the result</p></dd><dt>returns</dt><dd class="cmt"><p>DataFrame representing the data after the operation</p></dd></dl></div> + </li><li name="datafu.spark.SparkDFUtils#dedupWithOrder" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="dedupWithOrder(df:org.apache.spark.sql.DataFrame,groupCol:org.apache.spark.sql.Column,orderCols:org.apache.spark.sql.Column*):org.apache.spark.sql.DataFrame"></a><a id="dedupWithOrder(DataFrame,Column,Column*):DataFrame"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#dedupWithOrder(df:org.apache.spark.sql.DataFrame,groupCol:org.apache.spark.sql.Column,orderCols:org.apache.spark.sql.Column*):org.apache.spark.sql.DataFrame" title="Permalink"> + <i class="material-icons">î 
</i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier"></span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">dedupWithOrder</span><span class="params">(<span name="df">df: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span>, <span name="groupCol">groupCol: <span class="extype" name="org.apache.spark.sql.Column">Column</span></span>, <span name="orderCols">orderCols: <span class="extype" name="org.apache.spark.sql.Column">Column</span>*</span>)</span><span class="result">: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span> + </span> + + <p class="shortcomment cmt">Used to get the 'latest' record (after ordering according to the provided order columns) +in each group.</p><div class="fullcomment"><div class="comment cmt"><p>Used to get the 'latest' record (after ordering according to the provided order columns) +in each group. +Different from <code><span class="extype" name="org.apache.spark.sql.Dataset#dropDuplicates">org.apache.spark.sql.Dataset#dropDuplicates</span></code> because order matters. 
+</p></div><dl class="paramcmts block"><dt class="param">df</dt><dd class="cmt"><p>DataFrame to operate on</p></dd><dt class="param">groupCol</dt><dd class="cmt"><p>column to group by the records</p></dd><dt class="param">orderCols</dt><dd class="cmt"><p>columns to order the records according to</p></dd><dt>returns</dt><dd class="cmt"><p>DataFrame representing the data after the operation</p></dd></dl></div> + </li><li name="scala.AnyRef#eq" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="eq(x$1:AnyRef):Boolean"></a><a id="eq(AnyRef):Boolean"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#eq(x$1:AnyRef):Boolean" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier">final </span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">eq</span><span class="params">(<span name="arg0">arg0: <span class="extype" name="scala.AnyRef">AnyRef</span></span>)</span><span class="result">: <span class="extype" name="scala.Boolean">Boolean</span></span> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd>AnyRef</dd></dl></div> + </li><li name="scala.AnyRef#equals" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="equals(x$1:Any):Boolean"></a><a id="equals(Any):Boolean"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#equals(x$1:Any):Boolean" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier"></span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">equals</span><span class="params">(<span name="arg0">arg0: <span class="extype" name="scala.Any">Any</span></span>)</span><span class="result">: <span class="extype" name="scala.Boolean">Boolean</span></span> + 
</span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd>AnyRef â Any</dd></dl></div> + </li><li name="datafu.spark.SparkDFUtils#explodeArray" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="explodeArray(df:org.apache.spark.sql.DataFrame,arrayCol:org.apache.spark.sql.Column,alias:String):org.apache.spark.sql.DataFrame"></a><a id="explodeArray(DataFrame,Column,String):DataFrame"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#explodeArray(df:org.apache.spark.sql.DataFrame,arrayCol:org.apache.spark.sql.Column,alias:String):org.apache.spark.sql.DataFrame" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier"></span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">explodeArray</span><span class="params">(<span name="df">df: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span>, <span name="arrayCol">arrayCol: <span class="extype" name="org.apache.spark.sql.Column">Column</span></span>, <span name="alias">alias: <span class="extype" name="scala.Predef.String">String</span></span>)</span><span class="result">: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span> + </span> + + <p class="shortcomment cmt">Given an array column that you need to explode into different columns, use this method.</p><div class="fullcomment"><div class="comment cmt"><p>Given an array column that you need to explode into different columns, use this method. +This function counts the number of output columns by executing the Spark job internally on the input array column. 
+Consider caching the input dataframe if this is an expensive operation. +</p></div><dl class="paramcmts block"><dt>returns</dt><dd class="cmt"><p> +input ++-----+----------------------------------------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-left"><p>label</p></th><th class="doctbl-left"><p>sentence_arr </p></th></tr> + </thead> + + </table><p>+-----+----------------------------------------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-right"><p>0.0 </p></th><th class="doctbl-right"><p>[Hi, I heard, about, Spark] </p></th></tr> + </thead> + <tbody><tr><td class="doctbl-right"><p>1.0 </p></td><td class="doctbl-right"><p>[Logistic, regression, models, are neat]</p></td></tr> + </tbody> + </table><p>+-----+----------------------------------------+ +output ++-----+----------------------------------------+--------+----------+---------+------------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-left"><p>label</p></th><th class="doctbl-left"><p>sentence_arr </p></th><th class="doctbl-left"><p>token0 </p></th><th class="doctbl-left"><p>token1 </p></th><th class="doctbl-left"><p>token2 </p></th><th class="doctbl-left"><p>token3 </p></th></tr> + </thead> + + </table><p>+-----+----------------------------------------+--------+----------+---------+------------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-right"><p>0.0 </p></th><th class="doctbl-right"><p>[Hi, I heard, about, Spark] </p></th><th class="doctbl-right"><p>Hi </p></th><th class="doctbl-right"><p>I heard </p></th><th class="doctbl-right"><p>about </p></th><th class="doctbl-right"><p>Spark </p></th></tr> + </thead> + <tbody><tr><td class="doctbl-right"><p>1.0 </p></td><td class="doctbl-right"><p>[Logistic, regression, models, are neat]</p></td><td class="doctbl-right"><p>Logistic</p></td><td class="doctbl-right"><p>regression</p></td><td class="doctbl-right"><p>models </p></td><td class="doctbl-right"><p>are neat </p></td></tr> + </tbody> + 
</table><p>+-----+----------------------------------------+--------+----------+---------+------------+</p></dd></dl></div> + </li><li name="scala.AnyRef#finalize" visbl="prt" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="finalize():Unit"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#finalize():Unit" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier"></span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">finalize</span><span class="params">()</span><span class="result">: <span class="extype" name="scala.Unit">Unit</span></span> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Attributes</dt><dd>protected[<span class="extype" name="java.lang">lang</span>] </dd><dt>Definition Classes</dt><dd>AnyRef</dd><dt>Annotations</dt><dd> + <span class="name">@throws</span><span class="args">(<span> + + <span class="symbol">classOf[java.lang.Throwable]</span> + </span>)</span> + + </dd></dl></div> + </li><li name="datafu.spark.SparkDFUtils#flatten" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="flatten(df:org.apache.spark.sql.DataFrame,colName:String):org.apache.spark.sql.DataFrame"></a><a id="flatten(DataFrame,String):DataFrame"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#flatten(df:org.apache.spark.sql.DataFrame,colName:String):org.apache.spark.sql.DataFrame" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier"></span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">flatten</span><span class="params">(<span name="df">df: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" 
name="org.apache.spark.sql.DataFrame">DataFrame</a></span>, <span name="colName">colName: <span class="extype" name="scala.Predef.String">String</span></span>)</span><span class="result">: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span> + </span> + + <p class="shortcomment cmt">Returns a DataFrame with the given column (should be a StructType) +replaced by its inner fields.</p><div class="fullcomment"><div class="comment cmt"><p>Returns a DataFrame with the given column (should be a StructType) +replaced by its inner fields. +This method only flattens a single level of nesting.</p><p>+-------+----------+----------+----------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-left"><p>id </p></th><th class="doctbl-left"><p>s.sub_col1</p></th><th class="doctbl-left"><p>s.sub_col2</p></th><th class="doctbl-left"><p>s.sub_col3</p></th></tr> + </thead> + + </table><p>+-------+----------+----------+----------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-left"><p>123 </p></th><th class="doctbl-left"><p>1 </p></th><th class="doctbl-left"><p>2 </p></th><th class="doctbl-left"><p>3 </p></th></tr> + </thead> + + </table><p>+-------+----------+----------+----------+</p><p>+-------+----------+----------+----------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-left"><p>id </p></th><th class="doctbl-left"><p>sub_col1 </p></th><th class="doctbl-left"><p>sub_col2 </p></th><th class="doctbl-left"><p>sub_col3 </p></th></tr> + </thead> + + </table><p>+-------+----------+----------+----------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-left"><p>123 </p></th><th class="doctbl-left"><p>1 </p></th><th class="doctbl-left"><p>2 </p></th><th class="doctbl-left"><p>3 </p></th></tr> + </thead> + + </table><p>+-------+----------+----------+----------+ +</p></div><dl class="paramcmts block"><dt 
class="param">df</dt><dd class="cmt"><p>DataFrame to operate on</p></dd><dt class="param">colName</dt><dd class="cmt"><p>column name for a column of type StructType</p></dd><dt>returns</dt><dd class="cmt"><p>DataFrame representing the data after the operation</p></dd></dl></div> + </li><li name="scala.AnyRef#getClass" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="getClass():Class[_]"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#getClass():Class[_]" title="Permalink"> + <i class="material-icons">&#xE157;</i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier">final </span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">getClass</span><span class="params">()</span><span class="result">: <span class="extype" name="java.lang.Class">Class</span>[_]</span> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd>AnyRef → Any</dd><dt>Annotations</dt><dd> + <span class="name">@native</span><span class="args">()</span> + + </dd></dl></div> + </li><li name="scala.AnyRef#hashCode" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="hashCode():Int"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#hashCode():Int" title="Permalink"> + <i class="material-icons">&#xE157;</i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier"></span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">hashCode</span><span class="params">()</span><span class="result">: <span class="extype" name="scala.Int">Int</span></span> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd>AnyRef → Any</dd><dt>Annotations</dt><dd> + <span class="name">@native</span><span class="args">()</span> + + </dd></dl></div> + </li><li name="scala.Any#isInstanceOf" 
visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="isInstanceOf[T0]:Boolean"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#isInstanceOf[T0]:Boolean" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier">final </span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">isInstanceOf</span><span class="tparams">[<span name="T0">T0</span>]</span><span class="result">: <span class="extype" name="scala.Boolean">Boolean</span></span> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd>Any</dd></dl></div> + </li><li name="datafu.spark.SparkDFUtils#joinSkewed" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="joinSkewed(dfLeft:org.apache.spark.sql.DataFrame,dfRight:org.apache.spark.sql.DataFrame,joinExprs:org.apache.spark.sql.Column,numShards:Int,joinType:String):org.apache.spark.sql.DataFrame"></a><a id="joinSkewed(DataFrame,DataFrame,Column,Int,String):DataFrame"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#joinSkewed(dfLeft:org.apache.spark.sql.DataFrame,dfRight:org.apache.spark.sql.DataFrame,joinExprs:org.apache.spark.sql.Column,numShards:Int,joinType:String):org.apache.spark.sql.DataFrame" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier"></span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">joinSkewed</span><span class="params">(<span name="dfLeft">dfLeft: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span>, <span name="dfRight">dfRight: <a 
href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span>, <span name="joinExprs">joinExprs: <span class="extype" name="org.apache.spark.sql.Column">Column</span></span>, <span name="numShards">numShards: <span class="extype" name="scala.Int">Int</span> = <span class="symbol">10</span></span>, <span name="joinType">joinType: <span class="extype" name="scala.Predef.String">String</span> = <span class="symbol">"inner"</span></span>)</span><span class="result">: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span> + </span> + + <p class="shortcomment cmt">Used to perform a join when the right df is relatively small +but still too big to fit in memory to perform map side broadcast join.</p><div class="fullcomment"><div class="comment cmt"><p>Used to perform a join when the right df is relatively small +but still too big to fit in memory to perform map side broadcast join. +Use cases: +a. excluding keys that might be skewed from a medium size list. +b. join a big skewed table with a table that has small number of very large rows. 
+</p></div><dl class="paramcmts block"><dt class="param">dfLeft</dt><dd class="cmt"><p>left DataFrame</p></dd><dt class="param">dfRight</dt><dd class="cmt"><p>right DataFrame</p></dd><dt class="param">joinExprs</dt><dd class="cmt"><p>join expression</p></dd><dt class="param">numShards</dt><dd class="cmt"><p>number of shards - number of times to duplicate the right DataFrame</p></dd><dt class="param">joinType</dt><dd class="cmt"><p>join type</p></dd><dt>returns</dt><dd class="cmt"><p>joined DataFrame</p></dd></dl></div> + </li><li name="datafu.spark.SparkDFUtils#joinWithRange" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="joinWithRange(dfSingle:org.apache.spark.sql.DataFrame,colSingle:String,dfRange:org.apache.spark.sql.DataFrame,colRangeStart:String,colRangeEnd:String,DECREASE_FACTOR:Long):org.apache.spark.sql.DataFrame"></a><a id="joinWithRange(DataFrame,String,DataFrame,String,String,Long):DataFrame"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#joinWithRange(dfSingle:org.apache.spark.sql.DataFrame,colSingle:String,dfRange:org.apache.spark.sql.DataFrame,colRangeStart:String,colRangeEnd:String,DECREASE_FACTOR:Long):org.apache.spark.sql.DataFrame" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier"></span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">joinWithRange</span><span class="params">(<span name="dfSingle">dfSingle: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span>, <span name="colSingle">colSingle: <span class="extype" name="scala.Predef.String">String</span></span>, <span name="dfRange">dfRange: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" 
name="org.apache.spark.sql.DataFrame">DataFrame</a></span>, <span name="colRangeStart">colRangeStart: <span class="extype" name="scala.Predef.String">String</span></span>, <span name="colRangeEnd">colRangeEnd: <span class="extype" name="scala.Predef.String">String</span></span>, <span name="DECREASE_FACTOR">DECREASE_FACTOR: <span class="extype" name="scala.Long">Long</span></span>)</span><span class="result">: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span> + </span> + + <p class="shortcomment cmt">Helper function to join a table with point column to a table with range column.</p><div class="fullcomment"><div class="comment cmt"><p>Helper function to join a table with point column to a table with range column. +For example, join a table that contains specific time in minutes with a table that +contains time ranges. +The main problem this function addresses is that doing naive explode on the ranges can result +in a huge table. +requires: +1. point table needs to be distinct on the point column. +2. 
the range and point columns need to be numeric.</p><p>TIMES: ++-------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-left"><p>time </p></th></tr> + </thead> + + </table><p>+-------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-left"><p>11:55 </p></th></tr> + </thead> + + </table><p>+-------+</p><p>TIME RANGES: ++----------+---------+----------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-left"><p>start_time</p></th><th class="doctbl-left"><p>end_time </p></th><th class="doctbl-left"><p>desc </p></th></tr> + </thead> + + </table><p>+----------+---------+----------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-left"><p>10:00 </p></th><th class="doctbl-left"><p>12:00 </p></th><th class="doctbl-left"><p> meeting </p></th></tr> + </thead> + + </table><p>+----------+---------+----------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-left"><p>11:50 </p></th><th class="doctbl-left"><p>12:15 </p></th><th class="doctbl-left"><p> lunch </p></th></tr> + </thead> + + </table><p>+----------+---------+----------+</p><p>OUTPUT: ++-------+----------+---------+---------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-left"><p>time </p></th><th class="doctbl-left"><p>start_time</p></th><th class="doctbl-left"><p>end_time </p></th><th class="doctbl-left"><p>desc </p></th></tr> + </thead> + + </table><p>+-------+----------+---------+---------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-left"><p>11:55 </p></th><th class="doctbl-left"><p>10:00 </p></th><th class="doctbl-left"><p>12:00 </p></th><th class="doctbl-left"><p> meeting </p></th></tr> + </thead> + + </table><p>+-------+----------+---------+---------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-left"><p>11:55 </p></th><th class="doctbl-left"><p>11:50 </p></th><th class="doctbl-left"><p>12:15 </p></th><th class="doctbl-left"><p> lunch </p></th></tr> + </thead> + + 
</table><p>+-------+----------+---------+---------+ +</p></div><dl class="paramcmts block"><dt class="param">dfSingle</dt><dd class="cmt"><p>DataFrame that contains the point column</p></dd><dt class="param">colSingle</dt><dd class="cmt"><p>the point column's name</p></dd><dt class="param">dfRange</dt><dd class="cmt"><p>DataFrame that contains the range column</p></dd><dt class="param">colRangeStart</dt><dd class="cmt"><p>the start range column's name</p></dd><dt class="param">colRangeEnd</dt><dd class="cmt"><p>the end range column's name</p></dd><dt class="param">DECREASE_FACTOR</dt><dd class="cmt"><p>resolution factor. instead of exploding the range column directly, + we first decrease its resolution by this factor</p></dd></dl></div> + </li><li name="datafu.spark.SparkDFUtils#joinWithRangeAndDedup" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="joinWithRangeAndDedup(dfSingle:org.apache.spark.sql.DataFrame,colSingle:String,dfRange:org.apache.spark.sql.DataFrame,colRangeStart:String,colRangeEnd:String,DECREASE_FACTOR:Long,dedupSmallRange:Boolean):org.apache.spark.sql.DataFrame"></a><a id="joinWithRangeAndDedup(DataFrame,String,DataFrame,String,String,Long,Boolean):DataFrame"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#joinWithRangeAndDedup(dfSingle:org.apache.spark.sql.DataFrame,colSingle:String,dfRange:org.apache.spark.sql.DataFrame,colRangeStart:String,colRangeEnd:String,DECREASE_FACTOR:Long,dedupSmallRange:Boolean):org.apache.spark.sql.DataFrame" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier"></span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">joinWithRangeAndDedup</span><span class="params">(<span name="dfSingle">dfSingle: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" 
name="org.apache.spark.sql.DataFrame">DataFrame</a></span>, <span name="colSingle">colSingle: <span class="extype" name="scala.Predef.String">String</span></span>, <span name="dfRange">dfRange: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span>, <span name="colRangeStart">colRangeStart: <span class="extype" name="scala.Predef.String">String</span></span>, <span name="colRangeEnd">colRangeEnd: <span class="extype" name="scala.Predef.String">String</span></span>, <span name="DECREASE_FACTOR">DECREASE_FACTOR: <span class="extype" name="scala.Long">Long</span></span>, <span name="dedupSmallRange">dedupSmallRange: <span class="extype" name="scala.Boolean">Boolean</span></span>)</span><span class="result">: <a href="../../org/apache/spark/sql/index.html#DataFrame=org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]" class="extmbr" name="org.apache.spark.sql.DataFrame">DataFrame</a></span> + </span> + + <p class="shortcomment cmt">Run joinWithRange and afterwards run dedup +</p><div class="fullcomment"><div class="comment cmt"><p>Run joinWithRange and afterwards run dedup +</p></div><dl class="paramcmts block"><dt class="param">dedupSmallRange</dt><dd class="cmt"><p>- by small/large range + OUTPUT for dedupSmallRange = "true": ++-------+----------+---------+---------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-left"><p>time </p></th><th class="doctbl-left"><p>start_time</p></th><th class="doctbl-left"><p>end_time </p></th><th class="doctbl-left"><p>desc </p></th></tr> + </thead> + + </table><p>+-------+----------+---------+---------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-left"><p>11:55 </p></th><th class="doctbl-left"><p>11:50 </p></th><th class="doctbl-left"><p>12:15 </p></th><th class="doctbl-left"><p> lunch </p></th></tr> + </thead> + + </table><p>+-------+----------+---------+---------+ + 
OUTPUT for dedupSmallRange = "false": ++-------+----------+---------+---------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-left"><p>time </p></th><th class="doctbl-left"><p>start_time</p></th><th class="doctbl-left"><p>end_time </p></th><th class="doctbl-left"><p>desc </p></th></tr> + </thead> + + </table><p>+-------+----------+---------+---------+</p><table class="doctbl"> + <thead> + <tr><th class="doctbl-left"><p>11:55 </p></th><th class="doctbl-left"><p>10:00 </p></th><th class="doctbl-left"><p>12:00 </p></th><th class="doctbl-left"><p> meeting </p></th></tr> + </thead> + + </table><p>+-------+----------+---------+---------+</p></dd></dl></div> + </li><li name="scala.AnyRef#ne" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="ne(x$1:AnyRef):Boolean"></a><a id="ne(AnyRef):Boolean"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#ne(x$1:AnyRef):Boolean" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier">final </span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">ne</span><span class="params">(<span name="arg0">arg0: <span class="extype" name="scala.AnyRef">AnyRef</span></span>)</span><span class="result">: <span class="extype" name="scala.Boolean">Boolean</span></span> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd>AnyRef</dd></dl></div> + </li><li name="scala.AnyRef#notify" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="notify():Unit"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#notify():Unit" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier">final </span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span 
class="name">notify</span><span class="params">()</span><span class="result">: <span class="extype" name="scala.Unit">Unit</span></span> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd>AnyRef</dd><dt>Annotations</dt><dd> + <span class="name">@native</span><span class="args">()</span> + + </dd></dl></div> + </li><li name="scala.AnyRef#notifyAll" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="notifyAll():Unit"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#notifyAll():Unit" title="Permalink"> + <i class="material-icons">&#xE157;</i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier">final </span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">notifyAll</span><span class="params">()</span><span class="result">: <span class="extype" name="scala.Unit">Unit</span></span> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd>AnyRef</dd><dt>Annotations</dt><dd> + <span class="name">@native</span><span class="args">()</span> + + </dd></dl></div> + </li><li name="scala.AnyRef#synchronized" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="synchronized[T0](x$1:=>T0):T0"></a><a id="synchronized[T0](⇒T0):T0"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#synchronized[T0](x$1:=>T0):T0" title="Permalink"> + <i class="material-icons">&#xE157;</i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier">final </span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">synchronized</span><span class="tparams">[<span name="T0">T0</span>]</span><span class="params">(<span name="arg0">arg0: ⇒ <span class="extype" name="java.lang.AnyRef.synchronized.T0">T0</span></span>)</span><span class="result">: <span class="extype" 
name="java.lang.AnyRef.synchronized.T0">T0</span></span> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd>AnyRef</dd></dl></div> + </li><li name="scala.AnyRef#toString" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="toString():String"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#toString():String" title="Permalink"> + <i class="material-icons">&#xE157;</i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier"></span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">toString</span><span class="params">()</span><span class="result">: <span class="extype" name="java.lang.String">String</span></span> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd>AnyRef → Any</dd></dl></div> + </li><li name="scala.AnyRef#wait" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="wait():Unit"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#wait():Unit" title="Permalink"> + <i class="material-icons">&#xE157;</i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier">final </span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">wait</span><span class="params">()</span><span class="result">: <span class="extype" name="scala.Unit">Unit</span></span> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd>AnyRef</dd><dt>Annotations</dt><dd> + <span class="name">@throws</span><span class="args">(<span> + + <span class="defval" name="classOf[java.lang.InterruptedException]">...</span> + </span>)</span> + + </dd></dl></div> + </li><li name="scala.AnyRef#wait" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="wait(x$1:Long,x$2:Int):Unit"></a><a 
id="wait(Long,Int):Unit"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#wait(x$1:Long,x$2:Int):Unit" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier">final </span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">wait</span><span class="params">(<span name="arg0">arg0: <span class="extype" name="scala.Long">Long</span></span>, <span name="arg1">arg1: <span class="extype" name="scala.Int">Int</span></span>)</span><span class="result">: <span class="extype" name="scala.Unit">Unit</span></span> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd>AnyRef</dd><dt>Annotations</dt><dd> + <span class="name">@throws</span><span class="args">(<span> + + <span class="defval" name="classOf[java.lang.InterruptedException]">...</span> + </span>)</span> + + </dd></dl></div> + </li><li name="scala.AnyRef#wait" visbl="pub" class="indented0 " data-isabs="false" fullComment="yes" group="Ungrouped"> + <a id="wait(x$1:Long):Unit"></a><a id="wait(Long):Unit"></a> + <span class="permalink"> + <a href="../../datafu/spark/SparkDFUtils$.html#wait(x$1:Long):Unit" title="Permalink"> + <i class="material-icons">î </i> + </a> + </span> + <span class="modifier_kind"> + <span class="modifier">final </span> + <span class="kind">def</span> + </span> + <span class="symbol"> + <span class="name">wait</span><span class="params">(<span name="arg0">arg0: <span class="extype" name="scala.Long">Long</span></span>)</span><span class="result">: <span class="extype" name="scala.Unit">Unit</span></span> + </span> + + <div class="fullcomment"><dl class="attributes block"> <dt>Definition Classes</dt><dd>AnyRef</dd><dt>Annotations</dt><dd> + <span class="name">@throws</span><span class="args">(<span> + + <span class="defval" name="classOf[java.lang.InterruptedException]">...</span> + </span>)</span> + + <span 
class="name">@native</span><span class="args">()</span> + + </dd></dl></div> + </li> + </ol> + </div> + + + + + </div> + + <div id="inheritedMembers"> + <div class="parent" name="scala.AnyRef"> + <h3>Inherited from <span class="extype" name="scala.AnyRef">AnyRef</span></h3> + </div><div class="parent" name="scala.Any"> + <h3>Inherited from <span class="extype" name="scala.Any">Any</span></h3> + </div> + + </div> + + <div id="groupedMembers"> + <div class="group" name="Ungrouped"> + <h3>Ungrouped</h3> + + </div> + </div> + + </div> + + <div id="tooltip"></div> + + <div id="footer"> </div> + </div> + </div> + </div> + </body> + </html>