This is an automated email from the ASF dual-hosted git repository.
derrickaw pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git
The following commit(s) were added to refs/heads/master by this push:
new 30e98f7fd52 yaml_transform_test - add some debug logs and increase row count (#38367)
30e98f7fd52 is described below
commit 30e98f7fd520f99d29c3f978c0802d19f5bdf5f7
Author: Derrick Williams <[email protected]>
AuthorDate: Fri May 8 09:59:07 2026 -0400
yaml_transform_test - add some debug logs and increase row count (#38367)
* add some debug logs and increase row count
* Update sdks/python/apache_beam/yaml/yaml_transform_test.py
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---------
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
.../python/apache_beam/yaml/yaml_transform_test.py | 24 +++++++++-------------
1 file changed, 10 insertions(+), 14 deletions(-)
diff --git a/sdks/python/apache_beam/yaml/yaml_transform_test.py b/sdks/python/apache_beam/yaml/yaml_transform_test.py
index a4da97f7f50..bbb60b185c0 100644
--- a/sdks/python/apache_beam/yaml/yaml_transform_test.py
+++ b/sdks/python/apache_beam/yaml/yaml_transform_test.py
@@ -253,17 +253,8 @@ class YamlTransformE2ETest(unittest.TestCase):
raise unittest.SkipTest('Pandas not available.')
with tempfile.TemporaryDirectory() as tmpdir:
- data = pd.DataFrame([
- {
- 'label': '11a', 'rank': 0
- },
- {
- 'label': '37a', 'rank': 1
- },
- {
- 'label': '389a', 'rank': 2
- },
- ])
+ data = pd.DataFrame([{'label': f'{i}a', 'rank': i} for i in range(1024)])
+
input = os.path.join(tmpdir, 'input.csv')
output = os.path.join(tmpdir, 'output.json')
data.to_csv(input, index=False)
@@ -286,9 +277,14 @@ class YamlTransformE2ETest(unittest.TestCase):
num_shards: 1
- type: LogForTesting
''' % (repr(input), repr(output)))
- all_output = list(glob.glob(output + "*"))
- self.assertEqual(len(all_output), 1)
- output_shard = list(glob.glob(output + "*"))[0]
+ all_output = list(glob.glob(output + "-*"))
+ file_and_size = {f: os.path.getsize(f) for f in all_output}
+ self.assertEqual(
+ len(all_output),
+ 1,
+ msg=f"Expected 1 shard file, but found {len(all_output)}. "
+ f"Files & sizes (bytes): {file_and_size}")
+ output_shard = all_output[0]
result = pd.read_json(
output_shard, orient='records',
lines=True).sort_values('rank').reindex()