davisusanibar commented on code in PR #341:
URL: https://github.com/apache/arrow-cookbook/pull/341#discussion_r1497763066


##########
java/source/substrait.rst:
##########
@@ -201,6 +201,189 @@ For example, we can join the nation and customer tables 
from the TPC-H benchmark
     N_NAME    NUMBER_CUSTOMER
     PERU    573
 
+Execute Filter and Projection into a Dataset
+============================================
+
+The Dataset module can execute filters and projections defined with
+Substrait’s `Extended Expression`_. The substrait-java library is required to
+define the SQL expressions that we pass to the Dataset module.
+
+Filter Dataset
+--------------
+
+Here is an example of a Java program that filters rows from a Parquet file:
+
+- Loads a Parquet file containing the “nation” table from the TPC-H benchmark.
+- Applies a filter:
+    - `N_NATIONKEY > 10, AND`
+    - `N_NATIONKEY < 15`
+
+.. testcode::
+
+    import com.google.common.collect.ImmutableList;
+    import io.substrait.isthmus.SqlExpressionToSubstrait;
+    import io.substrait.proto.ExtendedExpression;
+    import java.nio.ByteBuffer;
+    import java.util.Base64;
+    import java.util.Optional;
+    import org.apache.arrow.dataset.file.FileFormat;
+    import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
+    import org.apache.arrow.dataset.jni.NativeMemoryPool;
+    import org.apache.arrow.dataset.scanner.ScanOptions;
+    import org.apache.arrow.dataset.scanner.Scanner;
+    import org.apache.arrow.dataset.source.Dataset;
+    import org.apache.arrow.dataset.source.DatasetFactory;
+    import org.apache.arrow.memory.BufferAllocator;
+    import org.apache.arrow.memory.RootAllocator;
+    import org.apache.arrow.vector.ipc.ArrowReader;
+    import org.apache.calcite.sql.parser.SqlParseException;
+
+    /**
+     * Builds the Substrait filter that is handed to the dataset scanner.
+     *
+     * <p>Converts the SQL predicate {@code N_NATIONKEY > 10 AND N_NATIONKEY < 15}, resolved
+     * against the NATION table definition, into an Extended Expression and copies its
+     * serialized bytes into a direct {@link ByteBuffer}.
+     *
+     * @return a direct buffer holding the serialized Extended Expression
+     * @throws SqlParseException if thrown while converting the SQL expression
+     */
+    ByteBuffer getFilterExpression() throws SqlParseException {
+      String sqlExpression = "N_NATIONKEY > 10 AND N_NATIONKEY < 15";
+      String nation =
+          "CREATE TABLE NATION (N_NATIONKEY INT NOT NULL, N_NAME CHAR(25), "
+              + "N_REGIONKEY INT NOT NULL, N_COMMENT VARCHAR)";
+      SqlExpressionToSubstrait expressionToSubstrait = new SqlExpressionToSubstrait();
+      ExtendedExpression expression =
+          expressionToSubstrait.convert(sqlExpression, ImmutableList.of(nation));
+      // Serialized protobuf message, used as-is; no further encoding is applied.
+      byte[] expressionToByte = expression.toByteArray();
+      ByteBuffer byteBuffer = ByteBuffer.allocateDirect(expressionToByte.length);
+      byteBuffer.put(expressionToByte);
+      return byteBuffer;
+    }
+
+    /**
+     * Scans the TPC-H "nation" Parquet file with the Substrait filter applied and prints
+     * each loaded batch to standard output using {@code contentToTSVString()}.
+     *
+     * @throws SqlParseException if building the filter expression fails
+     * @throws RuntimeException wrapping any exception raised while scanning
+     */
+    void filterDataset() throws SqlParseException {
+      // The Parquet file is located relative to the current working directory.
+      String parquetUri =
+          "file:" + System.getProperty("user.dir") + "/thirdpartydeps/tpch/nation.parquet";
+      // No column selection is given (Optional.empty()); only the Substrait filter is set.
+      ScanOptions scanOptions =
+          new ScanOptions.Builder(/*batchSize*/ 32768)
+              .columns(Optional.empty())
+              .substraitFilter(getFilterExpression())
+              .build();
+      try (BufferAllocator rootAllocator = new RootAllocator();
+          DatasetFactory factory =
+              new FileSystemDatasetFactory(
+                  rootAllocator, NativeMemoryPool.getDefault(), FileFormat.PARQUET, parquetUri);
+          Dataset nationDataset = factory.finish();
+          Scanner nationScanner = nationDataset.newScan(scanOptions);
+          ArrowReader batchReader = nationScanner.scanBatches()) {
+        while (batchReader.loadNextBatch()) {
+          System.out.print(batchReader.getVectorSchemaRoot().contentToTSVString());
+        }
+      } catch (Exception e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    filterDataset();
+
+.. testoutput::
+
+    n_nationkey    n_name    n_regionkey    n_comment
+    11    IRAQ    4    nic deposits boost atop the quickly final requests? 
quickly regula
+    12    JAPAN    2    ously. final, express gifts cajole a
+    13    JORDAN    4    ic deposits are blithely about the carefully regular 
pa
+    14    KENYA    0     pending excuses haggle furiously deposits. pending, 
express pinto beans wake fluffily past t
+
+Projection Dataset
+------------------
+
+The following Java program projects new columns after applying a filter and
+a projection definition to a Parquet file:
+
+- Loads a Parquet file containing the “nation” table from the TPC-H benchmark.
+- Applies a filter:
+ - `N_NATIONKEY > 10, AND`
+ - `N_NATIONKEY < 15`
+- Projects three new columns:
+ - `N_NAME`
+ - `N_NATIONKEY > 12`
+ - `N_NATIONKEY + 31`
+
+.. testcode::
+
+    import com.google.common.collect.ImmutableList;
+    import io.substrait.isthmus.SqlExpressionToSubstrait;
+    import io.substrait.proto.ExtendedExpression;
+    import java.nio.ByteBuffer;
+    import java.util.Base64;
+    import java.util.Optional;
+    import org.apache.arrow.dataset.file.FileFormat;
+    import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
+    import org.apache.arrow.dataset.jni.NativeMemoryPool;
+    import org.apache.arrow.dataset.scanner.ScanOptions;
+    import org.apache.arrow.dataset.scanner.Scanner;
+    import org.apache.arrow.dataset.source.Dataset;
+    import org.apache.arrow.dataset.source.DatasetFactory;
+    import org.apache.arrow.memory.BufferAllocator;
+    import org.apache.arrow.memory.RootAllocator;
+    import org.apache.arrow.vector.ipc.ArrowReader;
+    import org.apache.calcite.sql.parser.SqlParseException;
+
+    ByteBuffer getProjectExpression() throws SqlParseException {
+      String[] sqlExpression = new String[]{"N_NAME", "N_NATIONKEY > 12", 
"N_NATIONKEY + 31"};
+      String nation =
+          "CREATE TABLE NATION (N_NATIONKEY INT NOT NULL, N_NAME CHAR(25), "
+              + "N_REGIONKEY INT NOT NULL, N_COMMENT VARCHAR)";
+      SqlExpressionToSubstrait expressionToSubstrait = new 
SqlExpressionToSubstrait();
+      ExtendedExpression expression =
+          expressionToSubstrait.convert(sqlExpression, 
ImmutableList.of(nation));
+      byte[] expressionToByte =
+          
Base64.getDecoder().decode(Base64.getEncoder().encodeToString(expression.toByteArray()));

Review Comment:
   deleted



##########
java/source/substrait.rst:
##########
@@ -201,6 +201,189 @@ For example, we can join the nation and customer tables 
from the TPC-H benchmark
     N_NAME    NUMBER_CUSTOMER
     PERU    573
 
+Execute Filter and Projection into a Dataset
+============================================
+
+The Dataset module can execute filters and projections defined with
+Substrait’s `Extended Expression`_. The substrait-java library is required to
+define the SQL expressions that we pass to the Dataset module.
+
+Filter Dataset
+--------------
+
+Here is an example of a Java program that filters rows from a Parquet file:
+
+- Loads a Parquet file containing the “nation” table from the TPC-H benchmark.
+- Applies a filter:
+    - `N_NATIONKEY > 10, AND`
+    - `N_NATIONKEY < 15`
+
+.. testcode::
+
+    import com.google.common.collect.ImmutableList;
+    import io.substrait.isthmus.SqlExpressionToSubstrait;
+    import io.substrait.proto.ExtendedExpression;
+    import java.nio.ByteBuffer;
+    import java.util.Base64;
+    import java.util.Optional;
+    import org.apache.arrow.dataset.file.FileFormat;
+    import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
+    import org.apache.arrow.dataset.jni.NativeMemoryPool;
+    import org.apache.arrow.dataset.scanner.ScanOptions;
+    import org.apache.arrow.dataset.scanner.Scanner;
+    import org.apache.arrow.dataset.source.Dataset;
+    import org.apache.arrow.dataset.source.DatasetFactory;
+    import org.apache.arrow.memory.BufferAllocator;
+    import org.apache.arrow.memory.RootAllocator;
+    import org.apache.arrow.vector.ipc.ArrowReader;
+    import org.apache.calcite.sql.parser.SqlParseException;
+
+    /**
+     * Builds the Substrait filter that is handed to the dataset scanner.
+     *
+     * <p>Converts the SQL predicate {@code N_NATIONKEY > 10 AND N_NATIONKEY < 15}, resolved
+     * against the NATION table definition, into an Extended Expression and copies its
+     * serialized bytes into a direct {@link ByteBuffer}.
+     *
+     * @return a direct buffer holding the serialized Extended Expression
+     * @throws SqlParseException if thrown while converting the SQL expression
+     */
+    ByteBuffer getFilterExpression() throws SqlParseException {
+      String sqlExpression = "N_NATIONKEY > 10 AND N_NATIONKEY < 15";
+      String nation =
+          "CREATE TABLE NATION (N_NATIONKEY INT NOT NULL, N_NAME CHAR(25), "
+              + "N_REGIONKEY INT NOT NULL, N_COMMENT VARCHAR)";
+      SqlExpressionToSubstrait expressionToSubstrait = new SqlExpressionToSubstrait();
+      ExtendedExpression expression =
+          expressionToSubstrait.convert(sqlExpression, ImmutableList.of(nation));
+      // Serialized protobuf message, used as-is; no further encoding is applied.
+      byte[] expressionToByte = expression.toByteArray();
+      ByteBuffer byteBuffer = ByteBuffer.allocateDirect(expressionToByte.length);
+      byteBuffer.put(expressionToByte);
+      return byteBuffer;
+    }
+
+    /**
+     * Scans the TPC-H "nation" Parquet file with the Substrait filter applied and prints
+     * each loaded batch to standard output using {@code contentToTSVString()}.
+     *
+     * @throws SqlParseException if building the filter expression fails
+     * @throws RuntimeException wrapping any exception raised while scanning
+     */
+    void filterDataset() throws SqlParseException {
+      // The Parquet file is located relative to the current working directory.
+      String parquetUri =
+          "file:" + System.getProperty("user.dir") + "/thirdpartydeps/tpch/nation.parquet";
+      // No column selection is given (Optional.empty()); only the Substrait filter is set.
+      ScanOptions scanOptions =
+          new ScanOptions.Builder(/*batchSize*/ 32768)
+              .columns(Optional.empty())
+              .substraitFilter(getFilterExpression())
+              .build();
+      try (BufferAllocator rootAllocator = new RootAllocator();
+          DatasetFactory factory =
+              new FileSystemDatasetFactory(
+                  rootAllocator, NativeMemoryPool.getDefault(), FileFormat.PARQUET, parquetUri);
+          Dataset nationDataset = factory.finish();
+          Scanner nationScanner = nationDataset.newScan(scanOptions);
+          ArrowReader batchReader = nationScanner.scanBatches()) {
+        while (batchReader.loadNextBatch()) {
+          System.out.print(batchReader.getVectorSchemaRoot().contentToTSVString());
+        }
+      } catch (Exception e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    filterDataset();
+
+.. testoutput::
+
+    n_nationkey    n_name    n_regionkey    n_comment
+    11    IRAQ    4    nic deposits boost atop the quickly final requests? 
quickly regula
+    12    JAPAN    2    ously. final, express gifts cajole a
+    13    JORDAN    4    ic deposits are blithely about the carefully regular 
pa
+    14    KENYA    0     pending excuses haggle furiously deposits. pending, 
express pinto beans wake fluffily past t
+
+Projection Dataset
+------------------
+
+The following Java program projects new columns after applying a filter and
+a projection definition to a Parquet file:
+
+- Loads a Parquet file containing the “nation” table from the TPC-H benchmark.
+- Applies a filter:
+ - `N_NATIONKEY > 10, AND`
+ - `N_NATIONKEY < 15`
+- Projects three new columns:
+ - `N_NAME`
+ - `N_NATIONKEY > 12`
+ - `N_NATIONKEY + 31`
+
+.. testcode::
+
+    import com.google.common.collect.ImmutableList;
+    import io.substrait.isthmus.SqlExpressionToSubstrait;
+    import io.substrait.proto.ExtendedExpression;
+    import java.nio.ByteBuffer;
+    import java.util.Base64;
+    import java.util.Optional;
+    import org.apache.arrow.dataset.file.FileFormat;
+    import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
+    import org.apache.arrow.dataset.jni.NativeMemoryPool;
+    import org.apache.arrow.dataset.scanner.ScanOptions;
+    import org.apache.arrow.dataset.scanner.Scanner;
+    import org.apache.arrow.dataset.source.Dataset;
+    import org.apache.arrow.dataset.source.DatasetFactory;
+    import org.apache.arrow.memory.BufferAllocator;
+    import org.apache.arrow.memory.RootAllocator;
+    import org.apache.arrow.vector.ipc.ArrowReader;
+    import org.apache.calcite.sql.parser.SqlParseException;
+
+    /**
+     * Builds the Substrait projection that is handed to the dataset scanner.
+     *
+     * <p>Converts the SQL expressions {@code N_NAME}, {@code N_NATIONKEY > 12} and
+     * {@code N_NATIONKEY + 31}, resolved against the NATION table definition, into a single
+     * Extended Expression and copies its serialized bytes into a direct {@link ByteBuffer}.
+     *
+     * @return a direct buffer holding the serialized Extended Expression
+     * @throws SqlParseException if thrown while converting the SQL expressions
+     */
+    ByteBuffer getProjectExpression() throws SqlParseException {
+      String[] sqlExpression = new String[]{"N_NAME", "N_NATIONKEY > 12", "N_NATIONKEY + 31"};
+      String nation =
+          "CREATE TABLE NATION (N_NATIONKEY INT NOT NULL, N_NAME CHAR(25), "
+              + "N_REGIONKEY INT NOT NULL, N_COMMENT VARCHAR)";
+      SqlExpressionToSubstrait expressionToSubstrait = new SqlExpressionToSubstrait();
+      ExtendedExpression expression =
+          expressionToSubstrait.convert(sqlExpression, ImmutableList.of(nation));
+      // Serialized protobuf message, used as-is; no further encoding is applied.
+      byte[] expressionToByte = expression.toByteArray();
+      ByteBuffer byteBuffer = ByteBuffer.allocateDirect(expressionToByte.length);
+      byteBuffer.put(expressionToByte);
+      return byteBuffer;
+    }
+
+    ByteBuffer getFilterExpression() throws SqlParseException {
+      String sqlExpression = "N_NATIONKEY > 10 AND N_NATIONKEY < 15";
+      String nation =
+          "CREATE TABLE NATION (N_NATIONKEY INT NOT NULL, N_NAME CHAR(25), "
+              + "N_REGIONKEY INT NOT NULL, N_COMMENT VARCHAR)";
+      SqlExpressionToSubstrait expressionToSubstrait = new 
SqlExpressionToSubstrait();
+      ExtendedExpression expression =
+          expressionToSubstrait.convert(sqlExpression, 
ImmutableList.of(nation));
+      byte[] expressionToByte =
+          
Base64.getDecoder().decode(Base64.getEncoder().encodeToString(expression.toByteArray()));

Review Comment:
   deleted



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to