[ https://issues.apache.org/jira/browse/BEAM-4474?focusedWorklogId=110724&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-110724 ]
ASF GitHub Bot logged work on BEAM-4474: ---------------------------------------- Author: ASF GitHub Bot Created on: 11/Jun/18 19:30 Start Date: 11/Jun/18 19:30 Worklog Time Spent: 10m Work Description: herohde closed pull request #5557: [BEAM-4474] Ensure unbounded Dataflow jobs are submitted as streaming jobs URL: https://github.com/apache/beam/pull/5557 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/sdks/go/pkg/beam/core/graph/node.go b/sdks/go/pkg/beam/core/graph/node.go index d3acf1f0c18..9ed998f28b2 100644 --- a/sdks/go/pkg/beam/core/graph/node.go +++ b/sdks/go/pkg/beam/core/graph/node.go @@ -83,3 +83,13 @@ func NodeTypes(list []*Node) []typex.FullType { } return ret } + +// Bounded returns true iff all nodes are bounded. +func Bounded(ns []*Node) bool { + for _, n := range ns { + if !n.Bounded() { + return false + } + } + return true +} diff --git a/sdks/go/pkg/beam/runners/dataflow/dataflow.go b/sdks/go/pkg/beam/runners/dataflow/dataflow.go index 967d17be963..9bdc2acb141 100644 --- a/sdks/go/pkg/beam/runners/dataflow/dataflow.go +++ b/sdks/go/pkg/beam/runners/dataflow/dataflow.go @@ -45,6 +45,7 @@ import ( "golang.org/x/oauth2/google" df "google.golang.org/api/dataflow/v1b3" "google.golang.org/api/storage/v1" + "github.com/apache/beam/sdks/go/pkg/beam/core/graph" ) // TODO(herohde) 5/16/2017: the Dataflow flags should match the other SDKs. @@ -60,7 +61,6 @@ var ( network = flag.String("network", "", "GCP network (optional)") tempLocation = flag.String("temp_location", "", "Temp location (optional)") machineType = flag.String("worker_machine_type", "", "GCE machine type (optional)") - streaming = flag.Bool("streaming", false, "Streaming job") dryRun = flag.Bool("dry_run", false, "Dry run. 
Just print the job, but don't submit it.") teardownPolicy = flag.String("teardown_policy", "", "Job teardown policy (internal only).") @@ -103,7 +103,7 @@ func Execute(ctx context.Context, p *beam.Pipeline) error { } jobName := jobopts.GetJobName() - edges, _, err := p.Build() + edges, nodes, err := p.Build() if err != nil { return err } @@ -169,7 +169,9 @@ func Execute(ctx context.Context, p *beam.Pipeline) error { jobType := "JOB_TYPE_BATCH" apiJobType := "FNAPI_BATCH" - if *streaming { + + streaming := !graph.Bounded(nodes) + if streaming { jobType = "JOB_TYPE_STREAMING" apiJobType = "FNAPI_STREAMING" } @@ -223,7 +225,7 @@ func Execute(ctx context.Context, p *beam.Pipeline) error { if *tempLocation != "" { job.Environment.TempStoragePrefix = *tempLocation } - if *streaming { + if streaming { // Add separate data disk for streaming jobs job.Environment.WorkerPools[0].DataDisks = []*df.Disk{{}} } ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking ------------------- Worklog Id: (was: 110724) Time Spent: 50m (was: 40m) > Ensure unbounded Go pipelines are not run in batch on Dataflow > -------------------------------------------------------------- > > Key: BEAM-4474 > URL: https://issues.apache.org/jira/browse/BEAM-4474 > Project: Beam > Issue Type: Bug > Components: sdk-go > Reporter: Henning Rohde > Assignee: Henning Rohde > Priority: Major > Time Spent: 50m > Remaining Estimate: 0h > > This causes the job to fail. Specifically, streaming_wordcap currently needs > the option "--streaming=true" to work, but it shouldn't be necessary given > the pubsub source returns an unbounded PCollection. 
> Instead of a streaming flag, the Dataflow runner should inspect the > PCollections to determine whether to run batch or streaming. -- This message was sent by Atlassian JIRA (v7.6.3#76005)