This is an automated email from the ASF dual-hosted git repository.

jrmccluskey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git


The following commit(s) were added to refs/heads/master by this push:
     new 22ac580f827 [#38059] Fix GCS glob matching to support ** and / in 
object names (#38099)
22ac580f827 is described below

commit 22ac580f827961d6f341e3bb031f4c89d9bc0166
Author: innuendo <[email protected]>
AuthorDate: Fri Jun 12 22:14:39 2026 +0200

    [#38059] Fix GCS glob matching to support ** and / in object names (#38099)
    
    * fix GCS filesystem glob matching to handle / in object names and support 
**
    
    * update CHANGES.md
    
    * update CHANGES.md
    
    * address comments
    
    * Update sdks/go/pkg/beam/io/filesystem/gcs/gcs.go
    
    Co-authored-by: gemini-code-assist[bot] 
<176961590+gemini-code-assist[bot]@users.noreply.github.com>
    
    * modify beam_PostCommit_Go.json as required
    
    ---------
    
    Co-authored-by: [email protected] <[email protected]>
    Co-authored-by: gemini-code-assist[bot] 
<176961590+gemini-code-assist[bot]@users.noreply.github.com>
    Co-authored-by: Jack McCluskey 
<[email protected]>
---
 .github/trigger_files/beam_PostCommit_Go.json  |   2 +-
 CHANGES.md                                     |   2 +
 sdks/go/pkg/beam/io/filesystem/gcs/gcs.go      |  88 ++++++++++++++-
 sdks/go/pkg/beam/io/filesystem/gcs/gcs_test.go | 146 +++++++++++++++++++++++++
 4 files changed, 231 insertions(+), 7 deletions(-)

diff --git a/.github/trigger_files/beam_PostCommit_Go.json 
b/.github/trigger_files/beam_PostCommit_Go.json
index b73af5e61a4..7ab7bcd9a9c 100644
--- a/.github/trigger_files/beam_PostCommit_Go.json
+++ b/.github/trigger_files/beam_PostCommit_Go.json
@@ -1,4 +1,4 @@
 {
     "comment": "Modify this file in a trivial way to cause this test suite to 
run.",
-    "modification": 1
+    "modification": 2
 }
diff --git a/CHANGES.md b/CHANGES.md
index 7e8e55e3d12..c70fef0cf8e 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -85,6 +85,8 @@
 
 ## Bugfixes
 
+* Fixed GCS filesystem glob matching to correctly handle `/` in object names 
and support `**` for recursive matching (Go) 
([#38059](https://github.com/apache/beam/issues/38059)).
+* Fixed BigQueryEnrichmentHandler batch mode dropping earlier requests when 
multiple requests share the same enrichment key (Python) 
([#38035](https://github.com/apache/beam/issues/38035)).
 * Fixed IcebergIO writing manifest column bounds padded with trailing `0x00` 
bytes, which broke equality predicate pushdown in some query engines (Java) 
([#38580](https://github.com/apache/beam/issues/38580)).
 
 ## Security Fixes
diff --git a/sdks/go/pkg/beam/io/filesystem/gcs/gcs.go 
b/sdks/go/pkg/beam/io/filesystem/gcs/gcs.go
index 73e68638105..8d6ce28559f 100644
--- a/sdks/go/pkg/beam/io/filesystem/gcs/gcs.go
+++ b/sdks/go/pkg/beam/io/filesystem/gcs/gcs.go
@@ -21,7 +21,8 @@ import (
        "context"
        "fmt"
        "io"
-       "path/filepath"
+       "regexp"
+       "strings"
        "time"
 
        "cloud.google.com/go/storage"
@@ -38,6 +39,76 @@ const (
        projectBillingHook = "beam:go:hook:filesystem:billingproject"
 )
 
+// globToRegex translates a glob pattern to a regular expression.
+// It differs from filepath.Match in that:
+//   - / is treated as a regular character (not a separator), since GCS object
+//     names are flat with / being just another character
+//   - ** matches any sequence of characters including / (zero or more)
+//   - **/  matches zero or more path segments (e.g., "" or "dir/" or 
"dir/subdir/")
+//   - * matches any sequence of characters except / (zero or more)
+//   - ? matches any single character except /
+//
+// This matches the behavior of the Python and Java SDKs.
+func globToRegex(pattern string) (*regexp.Regexp, error) {
+       var result strings.Builder
+       result.WriteString("^")
+
+       for i := 0; i < len(pattern); i++ {
+               c := pattern[i]
+               switch c {
+               case '*':
+                       // Check for ** (double asterisk)
+                       if i+1 < len(pattern) && pattern[i+1] == '*' {
+                               // Check if followed by / (e.g., "**/" matches 
zero or more path segments)
+                               if i+2 < len(pattern) && pattern[i+2] == '/' {
+                                       // **/ matches "" or "something/" or 
"a/b/c/"
+                                       result.WriteString("(?:.*/)?")
+                                       i += 2 // Skip the second * and the /
+                               } else {
+                                       // ** at end or before non-slash 
matches any characters
+                                       result.WriteString(".*")
+                                       i++ // Skip the second *
+                               }
+                       } else {
+                               result.WriteString("[^/]*")
+                       }
+               case '?':
+                       result.WriteString("[^/]")
+               case '[':
+                       // Character class - find the closing bracket
+                       j := i + 1
+                       if j < len(pattern) && pattern[j] == '!' {
+                               j++
+                       }
+                       if j < len(pattern) && pattern[j] == ']' {
+                               j++
+                       }
+                       for j < len(pattern) && pattern[j] != ']' {
+                               j++
+                       }
+                       if j >= len(pattern) {
+                               return nil, fmt.Errorf("syntax error: unclosed 
'[' in pattern %q", pattern)
+                       } else {
+                               // Copy the character class, converting ! to ^ 
for negation
+                               result.WriteByte('[')
+                               content := pattern[i+1 : j]
+                               if len(content) > 0 && content[0] == '!' {
+                                       result.WriteByte('^')
+                                       content = content[1:]
+                               }
+                               result.WriteString(content)
+                               result.WriteByte(']')
+                               i = j
+                       }
+               default:
+                       result.WriteString(regexp.QuoteMeta(string(c)))
+               }
+       }
+
+       result.WriteString("$") // match end
+       return regexp.Compile(result.String())
+}
+
 var billingProject string = ""
 
 func init() {
@@ -107,6 +178,15 @@ func (f *fs) List(ctx context.Context, glob string) 
([]string, error) {
                return nil, err
        }
 
+       // Compile the glob pattern to a regex. We use a custom glob-to-regex
+       // translation that treats / as a regular character (not a separator),
+       // since GCS object names are flat. This also supports ** for recursive
+       // matching, similar to the Java and Python SDKs.
+       re, err := globToRegex(object)
+       if err != nil {
+               return nil, fmt.Errorf("invalid glob pattern %q: %w", object, 
err)
+       }
+
        var candidates []string
 
        // We handle globs by list all candidates and matching them here.
@@ -125,11 +205,7 @@ func (f *fs) List(ctx context.Context, glob string) 
([]string, error) {
                        return nil, err
                }
 
-               match, err := filepath.Match(object, obj.Name)
-               if err != nil {
-                       return nil, err
-               }
-               if match {
+               if re.MatchString(obj.Name) {
                        candidates = append(candidates, obj.Name)
                }
        }
diff --git a/sdks/go/pkg/beam/io/filesystem/gcs/gcs_test.go 
b/sdks/go/pkg/beam/io/filesystem/gcs/gcs_test.go
index 66dee6bb23f..cd6aab2a236 100644
--- a/sdks/go/pkg/beam/io/filesystem/gcs/gcs_test.go
+++ b/sdks/go/pkg/beam/io/filesystem/gcs/gcs_test.go
@@ -19,6 +19,7 @@ import (
        "context"
        "io"
        "sort"
+       "strings"
        "testing"
        "time"
 
@@ -271,6 +272,151 @@ func TestGCS_copy(t *testing.T) {
        }
 }
 
+func TestGlobToRegex(t *testing.T) {
+       tests := []struct {
+               pattern string
+               name    string
+               want    bool
+       }{
+               // Single * should NOT match / in object names
+               {"*.txt", "file.txt", true},
+               {"*.txt", "dir/file.txt", false},
+               {"prefix*", "prefix123", true},
+               {"prefix*", "prefix/subdir", false},
+
+               // ** should match any characters including /
+               {"**", "file.txt", true},
+               {"**", "dir/file.txt", true},
+               {"**", "dir/subdir/file.txt", true},
+               {"prefix/**", "prefix/file.txt", true},
+               {"prefix/**", "prefix/subdir/file.txt", true},
+               {"**/file.txt", "file.txt", true},
+               {"**/file.txt", "dir/file.txt", true},
+               {"**/file.txt", "dir/subdir/file.txt", true},
+
+               // Mixed patterns
+               {"dir/*.txt", "dir/file.txt", true},
+               {"dir/*.txt", "dir/subdir/file.txt", false},
+               {"dir/**/*.txt", "dir/file.txt", true},
+               {"dir/**/*.txt", "dir/subdir/file.txt", true},
+               {"dir/**/file.txt", "dir/file.txt", true},
+               {"dir/**/file.txt", "dir/a/b/c/file.txt", true},
+
+               // ? should match any single character except /
+               {"file?.txt", "file1.txt", true},
+               {"file?.txt", "file12.txt", false},
+               {"file?.txt", "file/.txt", false}, // ? should not cross /
+               {"dir?file.txt", "dir/file.txt", false},
+
+               // Character classes
+               {"file[0-9].txt", "file1.txt", true},
+               {"file[0-9].txt", "filea.txt", false},
+               {"file[!0-9].txt", "filea.txt", true},
+               {"file[!0-9].txt", "file1.txt", false},
+
+               // Exact match (no wildcards)
+               {"exact.txt", "exact.txt", true},
+               {"exact.txt", "notexact.txt", false},
+
+               // Regex special characters should be escaped
+               {"file.txt", "file.txt", true},
+               {"file.txt", "fileXtxt", false},
+               {"file(1).txt", "file(1).txt", true},
+       }
+
+       for _, tt := range tests {
+               t.Run(tt.pattern+"_"+tt.name, func(t *testing.T) {
+                       re, err := globToRegex(tt.pattern)
+                       if err != nil {
+                               t.Fatalf("globToRegex(%q) error = %v", 
tt.pattern, err)
+                       }
+                       got := re.MatchString(tt.name)
+                       if got != tt.want {
+                               t.Errorf("globToRegex(%q).MatchString(%q) = %v, 
want %v", tt.pattern, tt.name, got, tt.want)
+                       }
+               })
+       }
+}
+
+func TestGlobToRegex_errors(t *testing.T) {
+       tests := []struct {
+               pattern string
+               wantErr string
+       }{
+               {"file[abc.txt", "unclosed '['"},
+               {"[invalid", "unclosed '['"},
+       }
+
+       for _, tt := range tests {
+               t.Run(tt.pattern, func(t *testing.T) {
+                       _, err := globToRegex(tt.pattern)
+                       if err == nil {
+                               t.Errorf("globToRegex(%q) expected error 
containing %q, got nil", tt.pattern, tt.wantErr)
+                       } else if !strings.Contains(err.Error(), tt.wantErr) {
+                               t.Errorf("globToRegex(%q) error = %v, want 
error containing %q", tt.pattern, err, tt.wantErr)
+                       }
+               })
+       }
+}
+
+func TestGCS_listWithSlashesInObjectNames(t *testing.T) {
+       ctx := context.Background()
+       bucket := "beamgogcsfilesystemtest"
+       dirPath := "gs://" + bucket
+
+       // Create server with objects that have / in their names
+       server := fakestorage.NewServer([]fakestorage.Object{
+               {ObjectAttrs: fakestorage.ObjectAttrs{BucketName: bucket, Name: 
"file.txt"}, Content: []byte("")},
+               {ObjectAttrs: fakestorage.ObjectAttrs{BucketName: bucket, Name: 
"dir/file.txt"}, Content: []byte("")},
+               {ObjectAttrs: fakestorage.ObjectAttrs{BucketName: bucket, Name: 
"dir/subdir/file.txt"}, Content: []byte("")},
+               {ObjectAttrs: fakestorage.ObjectAttrs{BucketName: bucket, Name: 
"other.txt"}, Content: []byte("")},
+       })
+       t.Cleanup(server.Stop)
+       c := &fs{client: server.Client()}
+
+       tests := []struct {
+               glob string
+               want []string
+       }{
+               // Single * should only match top-level files
+               {dirPath + "/*.txt", []string{dirPath + "/file.txt", dirPath + 
"/other.txt"}},
+               // ** should match all files recursively
+               {dirPath + "/**", []string{
+                       dirPath + "/file.txt",
+                       dirPath + "/dir/file.txt",
+                       dirPath + "/dir/subdir/file.txt",
+                       dirPath + "/other.txt",
+               }},
+               // dir/* should only match immediate children
+               {dirPath + "/dir/*", []string{dirPath + "/dir/file.txt"}},
+               // dir/** should match all descendants
+               {dirPath + "/dir/**", []string{
+                       dirPath + "/dir/file.txt",
+                       dirPath + "/dir/subdir/file.txt",
+               }},
+               // Deeply nested ** matching (core scenario from issue #38059)
+               {dirPath + "/dir/subdir/**", []string{
+                       dirPath + "/dir/subdir/file.txt",
+               }},
+       }
+
+       for _, tt := range tests {
+               t.Run(tt.glob, func(t *testing.T) {
+                       got, err := c.List(ctx, tt.glob)
+                       if err != nil {
+                               t.Fatalf("List(%q) error = %v", tt.glob, err)
+                       }
+
+                       sort.Strings(got)
+                       sort.Strings(tt.want)
+
+                       if !cmp.Equal(got, tt.want) {
+                               t.Errorf("List(%q) = %v, want %v", tt.glob, 
got, tt.want)
+                       }
+               })
+       }
+}
+
 func createFakeGCSServer(tb testing.TB) *fakestorage.Server {
        tb.Helper()
 

Reply via email to