This is an automated email from the ASF dual-hosted git repository.
jrmccluskey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git
The following commit(s) were added to refs/heads/master by this push:
new 22ac580f827 [#38059] Fix GCS glob matching to support ** and / in
object names (#38099)
22ac580f827 is described below
commit 22ac580f827961d6f341e3bb031f4c89d9bc0166
Author: innuendo <[email protected]>
AuthorDate: Fri Jun 12 22:14:39 2026 +0200
[#38059] Fix GCS glob matching to support ** and / in object names (#38099)
* fix GCS filesystem glob matching to handle / in object names and support
**
* update CHANGES.md
* update CHANGES.md
* address comments
* Update sdks/go/pkg/beam/io/filesystem/gcs/gcs.go
Co-authored-by: gemini-code-assist[bot]
<176961590+gemini-code-assist[bot]@users.noreply.github.com>
* modify beam_PostCommit_Go.json as required
---------
Co-authored-by: [email protected] <[email protected]>
Co-authored-by: gemini-code-assist[bot]
<176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Jack McCluskey
<[email protected]>
---
.github/trigger_files/beam_PostCommit_Go.json | 2 +-
CHANGES.md | 2 +
sdks/go/pkg/beam/io/filesystem/gcs/gcs.go | 88 ++++++++++++++-
sdks/go/pkg/beam/io/filesystem/gcs/gcs_test.go | 146 +++++++++++++++++++++++++
4 files changed, 231 insertions(+), 7 deletions(-)
diff --git a/.github/trigger_files/beam_PostCommit_Go.json
b/.github/trigger_files/beam_PostCommit_Go.json
index b73af5e61a4..7ab7bcd9a9c 100644
--- a/.github/trigger_files/beam_PostCommit_Go.json
+++ b/.github/trigger_files/beam_PostCommit_Go.json
@@ -1,4 +1,4 @@
{
"comment": "Modify this file in a trivial way to cause this test suite to
run.",
- "modification": 1
+ "modification": 2
}
diff --git a/CHANGES.md b/CHANGES.md
index 7e8e55e3d12..c70fef0cf8e 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -85,6 +85,8 @@
## Bugfixes
+* Fixed GCS filesystem glob matching to correctly handle `/` in object names
and support `**` for recursive matching (Go)
([#38059](https://github.com/apache/beam/issues/38059)).
+* Fixed BigQueryEnrichmentHandler batch mode dropping earlier requests when
multiple requests share the same enrichment key (Python)
([#38035](https://github.com/apache/beam/issues/38035)).
* Fixed IcebergIO writing manifest column bounds padded with trailing `0x00`
bytes, which broke equality predicate pushdown in some query engines (Java)
([#38580](https://github.com/apache/beam/issues/38580)).
## Security Fixes
diff --git a/sdks/go/pkg/beam/io/filesystem/gcs/gcs.go
b/sdks/go/pkg/beam/io/filesystem/gcs/gcs.go
index 73e68638105..8d6ce28559f 100644
--- a/sdks/go/pkg/beam/io/filesystem/gcs/gcs.go
+++ b/sdks/go/pkg/beam/io/filesystem/gcs/gcs.go
@@ -21,7 +21,8 @@ import (
"context"
"fmt"
"io"
- "path/filepath"
+ "regexp"
+ "strings"
"time"
"cloud.google.com/go/storage"
@@ -38,6 +39,76 @@ const (
projectBillingHook = "beam:go:hook:filesystem:billingproject"
)
+// globToRegex translates a glob pattern to a regular expression.
+// It differs from filepath.Match in that:
+// - / is treated as a regular character (not a separator), since GCS object
+// names are flat with / being just another character
+// - ** matches any sequence of characters including / (zero or more)
+// - **/ matches zero or more path segments (e.g., "" or "dir/" or "dir/subdir/")
+// - * matches any sequence of characters except / (zero or more)
+// - ? matches any single character except /
+//
+// This matches the behavior of the Python and Java SDKs.
+func globToRegex(pattern string) (*regexp.Regexp, error) {
+ var result strings.Builder
+ result.WriteString("^")
+
+ for i := 0; i < len(pattern); i++ {
+ c := pattern[i]
+ switch c {
+ case '*':
+ // Check for ** (double asterisk)
+ if i+1 < len(pattern) && pattern[i+1] == '*' {
+ // Check if followed by / (e.g., "**/" matches zero or more path segments)
+ if i+2 < len(pattern) && pattern[i+2] == '/' {
+ // **/ matches "" or "something/" or "a/b/c/"
+ result.WriteString("(?:.*/)?")
+ i += 2 // Skip the second * and the /
+ } else {
+ // ** at end or before non-slash matches any characters
+ result.WriteString(".*")
+ i++ // Skip the second *
+ }
+ } else {
+ result.WriteString("[^/]*")
+ }
+ case '?':
+ result.WriteString("[^/]")
+ case '[':
+ // Character class - find the closing bracket
+ j := i + 1
+ if j < len(pattern) && pattern[j] == '!' {
+ j++
+ }
+ if j < len(pattern) && pattern[j] == ']' {
+ j++
+ }
+ for j < len(pattern) && pattern[j] != ']' {
+ j++
+ }
+ if j >= len(pattern) {
+ return nil, fmt.Errorf("syntax error: unclosed
'[' in pattern %q", pattern)
+ } else {
+ // Copy the character class, converting ! to ^ for negation
+ result.WriteByte('[')
+ content := pattern[i+1 : j]
+ if len(content) > 0 && content[0] == '!' {
+ result.WriteByte('^')
+ content = content[1:]
+ }
+ result.WriteString(content)
+ result.WriteByte(']')
+ i = j
+ }
+ default:
+ result.WriteString(regexp.QuoteMeta(string(c)))
+ }
+ }
+
+ result.WriteString("$") // match end
+ return regexp.Compile(result.String())
+}
+
var billingProject string = ""
func init() {
@@ -107,6 +178,15 @@ func (f *fs) List(ctx context.Context, glob string)
([]string, error) {
return nil, err
}
+ // Compile the glob pattern to a regex. We use a custom glob-to-regex
+ // translation that treats / as a regular character (not a separator),
+ // since GCS object names are flat. This also supports ** for recursive
+ // matching, similar to the Java and Python SDKs.
+ re, err := globToRegex(object)
+ if err != nil {
+ return nil, fmt.Errorf("invalid glob pattern %q: %w", object,
err)
+ }
+
var candidates []string
// We handle globs by listing all candidates and matching them here.
@@ -125,11 +205,7 @@ func (f *fs) List(ctx context.Context, glob string)
([]string, error) {
return nil, err
}
- match, err := filepath.Match(object, obj.Name)
- if err != nil {
- return nil, err
- }
- if match {
+ if re.MatchString(obj.Name) {
candidates = append(candidates, obj.Name)
}
}
diff --git a/sdks/go/pkg/beam/io/filesystem/gcs/gcs_test.go
b/sdks/go/pkg/beam/io/filesystem/gcs/gcs_test.go
index 66dee6bb23f..cd6aab2a236 100644
--- a/sdks/go/pkg/beam/io/filesystem/gcs/gcs_test.go
+++ b/sdks/go/pkg/beam/io/filesystem/gcs/gcs_test.go
@@ -19,6 +19,7 @@ import (
"context"
"io"
"sort"
+ "strings"
"testing"
"time"
@@ -271,6 +272,151 @@ func TestGCS_copy(t *testing.T) {
}
}
+func TestGlobToRegex(t *testing.T) {
+ tests := []struct {
+ pattern string
+ name string
+ want bool
+ }{
+ // Single * should NOT match / in object names
+ {"*.txt", "file.txt", true},
+ {"*.txt", "dir/file.txt", false},
+ {"prefix*", "prefix123", true},
+ {"prefix*", "prefix/subdir", false},
+
+ // ** should match any characters including /
+ {"**", "file.txt", true},
+ {"**", "dir/file.txt", true},
+ {"**", "dir/subdir/file.txt", true},
+ {"prefix/**", "prefix/file.txt", true},
+ {"prefix/**", "prefix/subdir/file.txt", true},
+ {"**/file.txt", "file.txt", true},
+ {"**/file.txt", "dir/file.txt", true},
+ {"**/file.txt", "dir/subdir/file.txt", true},
+
+ // Mixed patterns
+ {"dir/*.txt", "dir/file.txt", true},
+ {"dir/*.txt", "dir/subdir/file.txt", false},
+ {"dir/**/*.txt", "dir/file.txt", true},
+ {"dir/**/*.txt", "dir/subdir/file.txt", true},
+ {"dir/**/file.txt", "dir/file.txt", true},
+ {"dir/**/file.txt", "dir/a/b/c/file.txt", true},
+
+ // ? should match any single character except /
+ {"file?.txt", "file1.txt", true},
+ {"file?.txt", "file12.txt", false},
+ {"file?.txt", "file/.txt", false}, // ? should not cross /
+ {"dir?file.txt", "dir/file.txt", false},
+
+ // Character classes
+ {"file[0-9].txt", "file1.txt", true},
+ {"file[0-9].txt", "filea.txt", false},
+ {"file[!0-9].txt", "filea.txt", true},
+ {"file[!0-9].txt", "file1.txt", false},
+
+ // Exact match (no wildcards)
+ {"exact.txt", "exact.txt", true},
+ {"exact.txt", "notexact.txt", false},
+
+ // Regex special characters should be escaped
+ {"file.txt", "file.txt", true},
+ {"file.txt", "fileXtxt", false},
+ {"file(1).txt", "file(1).txt", true},
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.pattern+"_"+tt.name, func(t *testing.T) {
+ re, err := globToRegex(tt.pattern)
+ if err != nil {
+ t.Fatalf("globToRegex(%q) error = %v",
tt.pattern, err)
+ }
+ got := re.MatchString(tt.name)
+ if got != tt.want {
+ t.Errorf("globToRegex(%q).MatchString(%q) = %v,
want %v", tt.pattern, tt.name, got, tt.want)
+ }
+ })
+ }
+}
+
+func TestGlobToRegex_errors(t *testing.T) {
+ tests := []struct {
+ pattern string
+ wantErr string
+ }{
+ {"file[abc.txt", "unclosed '['"},
+ {"[invalid", "unclosed '['"},
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.pattern, func(t *testing.T) {
+ _, err := globToRegex(tt.pattern)
+ if err == nil {
+ t.Errorf("globToRegex(%q) expected error
containing %q, got nil", tt.pattern, tt.wantErr)
+ } else if !strings.Contains(err.Error(), tt.wantErr) {
+ t.Errorf("globToRegex(%q) error = %v, want
error containing %q", tt.pattern, err, tt.wantErr)
+ }
+ })
+ }
+}
+
+func TestGCS_listWithSlashesInObjectNames(t *testing.T) {
+ ctx := context.Background()
+ bucket := "beamgogcsfilesystemtest"
+ dirPath := "gs://" + bucket
+
+ // Create server with objects that have / in their names
+ server := fakestorage.NewServer([]fakestorage.Object{
+ {ObjectAttrs: fakestorage.ObjectAttrs{BucketName: bucket, Name:
"file.txt"}, Content: []byte("")},
+ {ObjectAttrs: fakestorage.ObjectAttrs{BucketName: bucket, Name:
"dir/file.txt"}, Content: []byte("")},
+ {ObjectAttrs: fakestorage.ObjectAttrs{BucketName: bucket, Name:
"dir/subdir/file.txt"}, Content: []byte("")},
+ {ObjectAttrs: fakestorage.ObjectAttrs{BucketName: bucket, Name:
"other.txt"}, Content: []byte("")},
+ })
+ t.Cleanup(server.Stop)
+ c := &fs{client: server.Client()}
+
+ tests := []struct {
+ glob string
+ want []string
+ }{
+ // Single * should only match top-level files
+ {dirPath + "/*.txt", []string{dirPath + "/file.txt", dirPath +
"/other.txt"}},
+ // ** should match all files recursively
+ {dirPath + "/**", []string{
+ dirPath + "/file.txt",
+ dirPath + "/dir/file.txt",
+ dirPath + "/dir/subdir/file.txt",
+ dirPath + "/other.txt",
+ }},
+ // dir/* should only match immediate children
+ {dirPath + "/dir/*", []string{dirPath + "/dir/file.txt"}},
+ // dir/** should match all descendants
+ {dirPath + "/dir/**", []string{
+ dirPath + "/dir/file.txt",
+ dirPath + "/dir/subdir/file.txt",
+ }},
+ // Deeply nested ** matching (core scenario from issue #38059)
+ {dirPath + "/dir/subdir/**", []string{
+ dirPath + "/dir/subdir/file.txt",
+ }},
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.glob, func(t *testing.T) {
+ got, err := c.List(ctx, tt.glob)
+ if err != nil {
+ t.Fatalf("List(%q) error = %v", tt.glob, err)
+ }
+
+ sort.Strings(got)
+ sort.Strings(tt.want)
+
+ if !cmp.Equal(got, tt.want) {
+ t.Errorf("List(%q) = %v, want %v", tt.glob,
got, tt.want)
+ }
+ })
+ }
+}
+
func createFakeGCSServer(tb testing.TB) *fakestorage.Server {
tb.Helper()