cgivre commented on code in PR #2836: URL: https://github.com/apache/drill/pull/2836#discussion_r1375364409
########## contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilBatchReader.java: ########## @@ -64,64 +69,97 @@ public DaffodilBatchReader (DaffodilReaderConfig readerConfig, EasySubScan scan, this.validationMode = formatConfig.getValidationMode(); // - // FIXME: Next, a MIRACLE occurs. + // FIXME: Where is this config file to be found? And, what is its syntax? // - // We get the dfdlSchemaURI filled in from the query, or a default config location - // We get the rootName (or null if not supplied) from the query, or a default config location - // We get the rootNamespace (or null if not supplied) from the query, or a default config location - // We get the validationMode (true/false) filled in from the query or a default config location - // We get the dataInputURI filled in from the query, or from a default config location - // - // For a first cut, let's just fake it. :-) + // FIXME: How do I arrange for these same things to be overriddable in the query + // or has that already happened before we get these things? - String rootName = null; - String rootNamespace = null; + DaffodilFormatConfig config = readerConfig.plugin.getConfig(); + boolean validationMode = config.getValidationMode(); + String dfdlSchemaURIString = config.getSchemaURI(); // "schema/complexArray1.dfdl.xsd"; + String rootName = config.getRootName(); + String rootNamespace = config.getRootNamespace(); + String dataInputURIString = config.getDataURI(); // "data/complexArray1.dat" URI dfdlSchemaURI; URI dataInputURI; - try { - dfdlSchemaURI = new URI("schema/complexArray1.dfdl.xsd"); - dataInputURI = new URI("data/complexArray1.dat"); + dfdlSchemaURI = new URI(dfdlSchemaURIString); + dataInputURI = new URI(dataInputURIString); } catch (URISyntaxException e) { throw UserException.validationError(e) - .message("Error retrieving DFDL schema files") .build(logger); } + DrillFileSystem fs = negotiator.file().fileSystem(); // FIXME: nagging me for a trywithresources? 
+ URI fsSchemaURI = fs.getUri().resolve(dfdlSchemaURI); + URI fsDataURI = fs.getUri().resolve(dataInputURI); + Path fsDataPath = new Path(fsDataURI); + // + // METADATA TIME: Obtain Daffodil metadata, build Drill metadata + // // given dfdlSchemaURI and validation settings, and rootName, rootNamespace optionally - // get the Daffodil DataProcessor (aka parser static information) that we need, and from that - // we get the DaffodilMesageParser, which is a stateful driver for daffodil that actually does - // parsing. + // get the Daffodil DataProcessor (aka parser static information) that we need. + // + + // + // FIXME: resolve this issue about schema loading + // + // My hope is that this fsSchemaURI can be opened via toURL().openStream(), i.e., I + // don't have to call a DrillFileSystem method to open it. + // because if I do, that requires me to refactor getProcessor in Daffodil + // which has the code to determine whether this is a source xsd and to search classpath + // for component schemas, etc. + // DFDL schemas are not small. A good example of a schema is one that is 835 files spread + // over a rich directory structure spread over 5 jar files which must be searched in + // a specific search order (ex: CLASSPATH Order) + // Daffodil simply MUST be able to load, via ordinary getClass().getResource(uri) calls, + // all the include/import files that are expressed via relative and absolute paths in + // the schema files. + // + // Daffodil also wants a URI here so that it can issue + // diagnostics which refer to it. + // + // If it is a pre-compiled binary schema then the issue is just that getProcessor() caches + // these so they're not reloaded over and over for a series of tests. 
+ // + + DaffodilDataProcessorFactory dpf = new DaffodilDataProcessorFactory(); DataProcessor dp; try { - dp = dpf.getDataProcessor(dfdlSchemaURI, true, rootName, rootNamespace); + dp = dpf.getDataProcessor(fsSchemaURI, validationMode, rootName, rootNamespace); } catch (Exception e) { throw UserException.dataReadError(e) - .message(String.format("Failed to get Daffodil DFDL processor for: %s", dfdlSchemaURI.toString())) + .message(String.format("Failed to get Daffodil DFDL processor for: %s", fsSchemaURI)) .addContext(errorContext).addContext(e.getMessage()).build(logger); } - // Create the corresponding Drill schema + // Create the corresponding Drill schema. + // Note: this could be a very large schema. Think of a large complex RDBMS schema, + // all of it, hundreds of tables, but all part of the same metadata tree. TupleMetadata drillSchema = daffodilDataProcessorToDrillSchema(dp); // Inform Drill about the schema negotiator.tableSchema(drillSchema, true); + // - // FIXME: Now a MIRACLE occurs. We get the drill row writer (actually a rowSetLoader)?? + // DATA TIME: Next we construct the runtime objects, and open files. // + // We get the DaffodilMessageParser, which is a stateful driver for daffodil that + // actually does the parsing. rowSetLoader = negotiator.build().writer(); // FIXME: is this right? // We construct the Daffodil InfosetOutputter which the daffodil parser uses to // convert infoset event calls to fill in a Drill row via a rowSetLoader. DaffodilDrillInfosetOutputter outputter = new DaffodilDrillInfosetOutputter(rowSetLoader); - // Now we can setup the dafParser with the outputter it will drive with the parser-produced - // infoset. + Review Comment: The `rowSetLoader` is correct. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: dev-unsubscribe@drill.apache.org
For queries about this service, please contact Infrastructure at: users@infra.apache.org