arnavarora2004 commented on code in PR #35696: URL: https://github.com/apache/beam/pull/35696#discussion_r2236927390
########## sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableReadSchemaTransformProviderIT.java: ########## @@ -136,95 +135,94 @@ public void tearDown() { tableAdminClient.close(); } - public List<Row> writeToTable(int numRows) { - List<Row> expectedRows = new ArrayList<>(); - - try { - for (int i = 1; i <= numRows; i++) { - String key = "key" + i; - String valueA = "value a" + i; - String valueB = "value b" + i; - String valueC = "value c" + i; - String valueD = "value d" + i; - long timestamp = 1000L * i; - - RowMutation rowMutation = - RowMutation.create(tableId, key) - .setCell(COLUMN_FAMILY_NAME_1, "a", timestamp, valueA) - .setCell(COLUMN_FAMILY_NAME_1, "b", timestamp, valueB) - .setCell(COLUMN_FAMILY_NAME_2, "c", timestamp, valueC) - .setCell(COLUMN_FAMILY_NAME_2, "d", timestamp, valueD); - dataClient.mutateRow(rowMutation); - - // Set up expected Beam Row - Map<String, List<Row>> columns1 = new HashMap<>(); - columns1.put( - "a", - Arrays.asList( - Row.withSchema(CELL_SCHEMA) - .withFieldValue( - "value", ByteBuffer.wrap(valueA.getBytes(StandardCharsets.UTF_8))) - .withFieldValue("timestamp_micros", timestamp) - .build())); - columns1.put( - "b", - Arrays.asList( - Row.withSchema(CELL_SCHEMA) - .withFieldValue( - "value", ByteBuffer.wrap(valueB.getBytes(StandardCharsets.UTF_8))) - .withFieldValue("timestamp_micros", timestamp) - .build())); - - Map<String, List<Row>> columns2 = new HashMap<>(); - columns2.put( - "c", - Arrays.asList( - Row.withSchema(CELL_SCHEMA) - .withFieldValue( - "value", ByteBuffer.wrap(valueC.getBytes(StandardCharsets.UTF_8))) - .withFieldValue("timestamp_micros", timestamp) - .build())); - columns2.put( - "d", - Arrays.asList( - Row.withSchema(CELL_SCHEMA) - .withFieldValue( - "value", ByteBuffer.wrap(valueD.getBytes(StandardCharsets.UTF_8))) - .withFieldValue("timestamp_micros", timestamp) - .build())); - - Map<String, Map<String, List<Row>>> families = new HashMap<>(); - 
families.put(COLUMN_FAMILY_NAME_1, columns1); - families.put(COLUMN_FAMILY_NAME_2, columns2); - - Row expectedRow = - Row.withSchema(ROW_SCHEMA) - .withFieldValue("key", ByteBuffer.wrap(key.getBytes(StandardCharsets.UTF_8))) - .withFieldValue("column_families", families) - .build(); - - expectedRows.add(expectedRow); - } - LOG.info("Finished writing {} rows to table {}", numRows, tableId); - } catch (NotFoundException e) { - throw new RuntimeException("Failed to write to table", e); - } - return expectedRows; - } - @Test public void testRead() { - List<Row> expectedRows = writeToTable(20); + int numRows = 20; + List<Row> expectedRows = new ArrayList<>(); + for (int i = 1; i <= numRows; i++) { + String key = "key" + i; + byte[] keyBytes = key.getBytes(StandardCharsets.UTF_8); + String valueA = "value a" + i; + byte[] valueABytes = valueA.getBytes(StandardCharsets.UTF_8); + String valueB = "value b" + i; + byte[] valueBBytes = valueB.getBytes(StandardCharsets.UTF_8); + String valueC = "value c" + i; + byte[] valueCBytes = valueC.getBytes(StandardCharsets.UTF_8); + String valueD = "value d" + i; + byte[] valueDBytes = valueD.getBytes(StandardCharsets.UTF_8); + long timestamp = 1000L * i; + + RowMutation rowMutation = + RowMutation.create(tableId, key) + .setCell(COLUMN_FAMILY_NAME_1, "a", timestamp, valueA) + .setCell(COLUMN_FAMILY_NAME_1, "b", timestamp, valueB) + .setCell(COLUMN_FAMILY_NAME_2, "c", timestamp, valueC) + .setCell(COLUMN_FAMILY_NAME_2, "d", timestamp, valueD); + dataClient.mutateRow(rowMutation); + + // Set up expected Beam Row + // FIX: Use byte[] instead of ByteBuffer + Map<String, List<Row>> columns1 = new HashMap<>(); + columns1.put( + "a", + Arrays.asList( + Row.withSchema(CELL_SCHEMA) + .withFieldValue("value", valueABytes) + .withFieldValue("timestamp_micros", timestamp) + .build())); + columns1.put( + "b", + Arrays.asList( + Row.withSchema(CELL_SCHEMA) + .withFieldValue("value", valueBBytes) + .withFieldValue("timestamp_micros", timestamp) + 
.build())); + + Map<String, List<Row>> columns2 = new HashMap<>(); + columns2.put( + "c", + Arrays.asList( + Row.withSchema(CELL_SCHEMA) + .withFieldValue("value", valueCBytes) + .withFieldValue("timestamp_micros", timestamp) + .build())); + columns2.put( + "d", + Arrays.asList( + Row.withSchema(CELL_SCHEMA) + .withFieldValue("value", valueDBytes) + .withFieldValue("timestamp_micros", timestamp) + .build())); + + Map<String, Map<String, List<Row>>> families = new HashMap<>(); + families.put(COLUMN_FAMILY_NAME_1, columns1); + families.put(COLUMN_FAMILY_NAME_2, columns2); + + Row expectedRow = + Row.withSchema(ROW_SCHEMA) + .withFieldValue("key", keyBytes) + .withFieldValue("column_families", families) + .build(); + + expectedRows.add(expectedRow); + } + LOG.info("Finished writing {} rows to table {}", numRows, tableId); + // FIX: Explicitly set flatten to false to match the expected nested rows. BigtableReadSchemaTransformConfiguration config = BigtableReadSchemaTransformConfiguration.builder() .setTableId(tableId) .setInstanceId(instanceId) .setProjectId(projectId) + .setFlatten(false) Review Comment: yes did that, ########## sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableReadSchemaTransformProvider.java: ########## @@ -152,45 +166,129 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { .withInstanceId(configuration.getInstanceId()) .withProjectId(configuration.getProjectId())); + // ParDo function implements fork logic if flatten == true + + // Determine the output schema based on the flatten configuration. + // The default for flatten is true. Review Comment: sounds good -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@beam.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org