tuxji commented on a change in pull request #525: URL: https://github.com/apache/daffodil/pull/525#discussion_r612464207
########## File path: README.md ########## @@ -23,107 +26,134 @@ [<img src="https://img.shields.io/maven-central/v/org.apache.daffodil/daffodil-core_2.12.svg?color=brightgreen&label=version" align="right"/>][Releases] <br clear="both" /> -Apache Daffodil is an open-source implementation of the [DFDL specification] -that uses DFDL data descriptions to parse fixed format data into an infoset. -This infoset is commonly converted into XML or JSON to enable the use of -well-established XML or JSON technologies and libraries to consume, inspect, -and manipulate fixed format data in existing solutions. Daffodil is also -capable of serializing or "unparsing" data back to the original data format. -The DFDL infoset can also be converted directly to/from the data structures -carried by data processing frameworks so as to bypass any XML/JSON overheads. +Apache Daffodil is an open-source implementation of the [DFDL +specification] that uses DFDL data descriptions to parse fixed format +data into an infoset. This infoset is commonly converted into XML or +JSON to enable the use of well-established XML or JSON technologies +and libraries to consume, inspect, and manipulate fixed format data in +existing solutions. Daffodil is also capable of serializing or +"unparsing" data back to the original data format. The DFDL infoset +can also be converted directly to/from the data structures carried by +data processing frameworks so as to bypass any XML/JSON overheads. -For more information about Daffodil, see https://daffodil.apache.org/. +For more information about Daffodil, see the [Website]. Review comment: Sure, I put the full URL back with brackets around it to avoid markdownlint's bare url warning (that was why I changed it before). ########## File path: daffodil-runtime2/src/main/resources/c/libruntime/errors.c ########## @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "errors.h" +#include <error.h> // for error +#include <inttypes.h> // for PRId64 +#include <stdio.h> // for NULL, feof, ferror, FILE, size_t +#include <stdlib.h> // for EXIT_FAILURE + +// error_message - return an internationalized error message + +static const char * +error_message(enum ErrorCode code) +{ + switch (code) + { + case ERR_CHOICE_KEY: + return "no match between choice dispatch key %" PRId64 + " and any branch key"; + case ERR_FILE_CLOSE: + return "error closing file"; + case ERR_FILE_FLUSH: + return "error flushing stream to file"; + case ERR_FILE_OPEN: + return "error opening file '%s'"; + case ERR_FIXED_VALUE: + return "value of element '%s' does not match value of its " + "'fixed' attribute"; + case ERR_INFOSET_READ: + return "cannot read infoset type '%s'"; + case ERR_INFOSET_WRITE: + return "cannot write infoset type '%s'"; + case ERR_PARSE_BOOL: + return "error parsing binary value %" PRId64 " as either true or false"; + case ERR_STACK_EMPTY: + return "stack empty, stopping program"; + case ERR_STACK_OVERFLOW: + return "stack overflow, stopping program"; + case ERR_STACK_UNDERFLOW: + return "stack underflow, stopping program"; + case ERR_STREAM_EOF: + return "EOF in stream, stopping program"; + case ERR_STREAM_ERROR: + return "error in stream, stopping program"; + case ERR_STRTOBOOL: + return "error converting XML data '%s' to boolean"; + case ERR_STRTOD_ERRNO: + return "error converting XML data '%s' to number"; + case ERR_STRTOI_ERRNO: + return "error converting XML data '%s' to integer"; + case ERR_STRTONUM_EMPTY: + return "found no number in XML data '%s'"; + case ERR_STRTONUM_NOT: + return "found non-number characters in XML data '%s'"; + case ERR_STRTONUM_RANGE: + return "number in XML data '%s' out of range"; + case ERR_XML_DECL: + return "error making new XML declaration"; + case ERR_XML_ELEMENT: + return "error making new XML element '%s'"; + case ERR_XML_ERD: + return "unexpected ERD typeCode %" PRId64 " while reading XML data"; + case ERR_XML_GONE: + return "ran out of XML data"; + case ERR_XML_INPUT: + return "unable to read XML data from input file"; + case ERR_XML_LEFT: + return "did not consume all of the XML data, '%s' left"; + case ERR_XML_MISMATCH: + return "found mismatch between XML data and infoset '%s'"; + case ERR_XML_WRITE: + return "error writing XML document"; + default: + return "unrecognized error code, shouldn't happen"; + } +} + +// print_maybe_stop - print a message and maybe stop the program + +static void +print_maybe_stop(const Error *err, int status) +{ + const int errnum = 0; + const char *format = "%s"; + const char *msg = error_message(err->code); + + switch (err->code) + { + case ERR_FILE_OPEN: + case ERR_FIXED_VALUE: + case ERR_INFOSET_READ: + case ERR_INFOSET_WRITE: + case ERR_STRTOBOOL: + case ERR_STRTOD_ERRNO: + case ERR_STRTOI_ERRNO: + case ERR_STRTONUM_EMPTY: + case ERR_STRTONUM_NOT: + case ERR_STRTONUM_RANGE: + case ERR_XML_ELEMENT: + case ERR_XML_LEFT: + case ERR_XML_MISMATCH: + error(status, errnum, msg, err->s); + break; + case ERR_CHOICE_KEY: + case ERR_PARSE_BOOL: + case ERR_XML_ERD: + error(status, errnum, msg, err->d64); + break; + default: + error(status, errnum, format, msg); + break; + } +} + +// need_diagnostics - return pointer to validation diagnostics + +Diagnostics * +need_diagnostics(void) +{ + static Diagnostics validati; + return &validati; +} + +// add_diagnostic - add a new error to validation diagnostics + +void +add_diagnostic(Diagnostics *validati, const Error *error) +{ + if (validati && error) + { + if (validati->length < + sizeof(validati->array) / sizeof(*validati->array)) Review comment: Makes sense. I've made add_diagnostic return false if the array was too full to add another diagnostic. ########## File path: README.md ########## @@ -23,107 +26,134 @@ [<img src="https://img.shields.io/maven-central/v/org.apache.daffodil/daffodil-core_2.12.svg?color=brightgreen&label=version" align="right"/>][Releases] <br clear="both" /> -Apache Daffodil is an open-source implementation of the [DFDL specification] -that uses DFDL data descriptions to parse fixed format data into an infoset. -This infoset is commonly converted into XML or JSON to enable the use of -well-established XML or JSON technologies and libraries to consume, inspect, -and manipulate fixed format data in existing solutions. Daffodil is also -capable of serializing or "unparsing" data back to the original data format. -The DFDL infoset can also be converted directly to/from the data structures -carried by data processing frameworks so as to bypass any XML/JSON overheads. +Apache Daffodil is an open-source implementation of the [DFDL +specification] that uses DFDL data descriptions to parse fixed format +data into an infoset. This infoset is commonly converted into XML or +JSON to enable the use of well-established XML or JSON technologies +and libraries to consume, inspect, and manipulate fixed format data in +existing solutions. Daffodil is also capable of serializing or +"unparsing" data back to the original data format. The DFDL infoset +can also be converted directly to/from the data structures carried by +data processing frameworks so as to bypass any XML/JSON overheads. -For more information about Daffodil, see https://daffodil.apache.org/. +For more information about Daffodil, see the [Website]. ## Build Requirements * JDK 8 or higher * SBT 0.13.8 or higher -* C compiler (for daffodil-runtime2 only) -* Mini-XML Version 3.2 or higher (for daffodil-runtime2 only) +* C compiler C99 or higher +* Mini-XML Version 3.2 or higher + +Since Daffodil has a DFDL to C backend, you will need a C compiler +([gcc] or [clang]), the [Mini-XML] library, and possibly the GNU +[argp] library if your system's C library doesn't include it. You can +install gcc and libmxml as system packages on most Unix based +platforms with distribution-specific packager commands such as (Debian +and Ubuntu): + + # Just mentioning all other packages you might need too + sudo apt install build-essential curl git libmxml-dev + +You will need the Java Software Development Kit ([JDK]) and the Scala +Build Tool ([SBT]) to build Daffodil, run all tests, create packages, +and more. [SDK] offers an easy and uniform way to install both java +and sbt on any Unix based platform: + + curl -s "https://get.sdkman.io" | bash + sdk install java + sdk install sbt + +You can edit the Compile / cCompiler setting in build.sbt if you don't +want sbt to call your C compiler with "cc" as the driver command. Review comment: Unfortunately, only Daffodil has logic for checking an environment variable and trying a list of known driver commands to find a C compiler. However, your `set Compiler / cCompiler := "gcc"` line makes me realize we could embed at least the environment variable logic into build.sbt. I've made a one-line change to build.sbt that sets cCompiler to either "cc" or the value of the environment variable "CC". Also note that setting CC = "true" could be one way people could avoid calling the C compiler although some unit/TDML tests still would break. ########## File path: daffodil-runtime2/src/main/resources/c/libcli/xml_reader.c ########## @@ -53,41 +54,44 @@ strtobool(const char *numptr, const char **errstrp) } else { - error_msg = "Error converting XML data to boolean"; + static Error error = {ERR_STRTOBOOL, {NULL}}; + error.s = numptr; + *errorptr = &error; } - *errstrp = error_msg; return value; } // Convert an XML element's text to a double (call strtod with our own // error checking) static double -strtodnum(const char *numptr, const char **errstrp) +strtodnum(const char *numptr, const Error **errorptr) { char *endptr = NULL; // Clear errno to detect error after calling strtod errno = 0; const double value = strtod(numptr, &endptr); - // Report any issues converting the string to a number + // Check for any errors converting the string to a number if (errno != 0) { - *errstrp = "Error converting XML data to number"; + static Error error = {ERR_STRTOD_ERRNO, {NULL}}; + error.s = numptr; + *errorptr = &error; Review comment: Yes, I want to avoid allocations here for discipline and consistency even though this is libcli code, not libruntime code. We are generating C code for an embedded processor now and HLS C code for a FPGA chip later, so we have severe constraints on the C code: all libruntime code is "bare metal" (no system calls to OS), no allocations, no recursive functions, etc. At least we have no other threads to worry about. Our caller must receive a pointer that either is NULL or points to an initialized Error struct. We can initialize a local Error struct in one line but our function's stack frame will be popped before the caller gets the pointer. We can call a helper function (get_error(...)) and pass it a local Error struct but that function would have to initialize its own static Error struct anyway (or array of Error structs if we want X calls of get_error(...) to return different pointers). ########## File path: daffodil-runtime2/src/main/scala/org/apache/daffodil/runtime2/generators/BinaryAbstractCodeGenerator.scala ########## @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.daffodil.runtime2.generators + +import org.apache.daffodil.dsom.ElementBase +import org.apache.daffodil.schema.annotation.props.gen.BitOrder +import org.apache.daffodil.schema.annotation.props.gen.ByteOrder +import org.apache.daffodil.schema.annotation.props.gen.OccursCountKind + +trait BinaryAbstractCodeGenerator { + + def binaryAbstractGenerateCode(e: ElementBase, initialValue: String, prim: String, + parseArgs: String, unparseArgs: String, cgState: CodeGeneratorState): Unit = { + + // For the time being this is a very limited back end. + // So there are some restrictions to enforce. + e.schemaDefinitionUnless(e.bitOrder eq BitOrder.MostSignificantBitFirst, "Only dfdl:bitOrder 'mostSignificantBitFirst' is supported.") + e.schemaDefinitionUnless(e.byteOrderEv.isConstant, "Runtime dfdl:byteOrder expressions not supported.") + e.schemaDefinitionUnless(e.elementLengthInBitsEv.isConstant, "Runtime dfdl:length expressions not supported.") + + val fieldName = e.namedQName.local + val byteOrder = e.byteOrderEv.constValue + val conv = if (byteOrder eq ByteOrder.BigEndian) "be" else "le" + val arraySize = if (e.occursCountKind == OccursCountKind.Fixed) e.maxOccurs else 0 + val fixed = e.xml.attribute("fixed") + val fixedValue = if (fixed.isDefined) fixed.get.text else "" Review comment: I've moved the fixed value logic to the DSOM and updated BinaryAbstractCodeGenerator.scala to use the DSOM getters. ########## File path: daffodil-runtime2/src/main/resources/c/libruntime/errors.h ########## @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ERRORS_H +#define ERRORS_H + +#include <stdio.h> // for FILE, size_t +#include <stdint.h> // for int64_t + +// ErrorCode - types of errors which could occur + +enum ErrorCode +{ + ERR_CHOICE_KEY, + ERR_FILE_CLOSE, + ERR_FILE_FLUSH, + ERR_FILE_OPEN, + ERR_FIXED_VALUE, + ERR_INFOSET_READ, + ERR_INFOSET_WRITE, + ERR_PARSE_BOOL, + ERR_STACK_EMPTY, + ERR_STACK_OVERFLOW, + ERR_STACK_UNDERFLOW, + ERR_STREAM_EOF, + ERR_STREAM_ERROR, + ERR_STRTOBOOL, + ERR_STRTOD_ERRNO, + ERR_STRTOI_ERRNO, + ERR_STRTONUM_EMPTY, + ERR_STRTONUM_NOT, + ERR_STRTONUM_RANGE, + ERR_XML_DECL, + ERR_XML_ELEMENT, + ERR_XML_ERD, + ERR_XML_GONE, + ERR_XML_INPUT, + ERR_XML_LEFT, + ERR_XML_MISMATCH, + ERR_XML_WRITE +}; + +// Error - specific error occuring now + +typedef struct Error +{ + enum ErrorCode code; + union + { + const char *s; // for %s + int64_t d64; // for %d64 + }; +} Error; + +// Diagnostics - array of validation errors + +typedef struct Diagnostics +{ + Error array[100]; + size_t length; +} Diagnostics; + +// PState - mutable state while parsing data + +typedef struct PState +{ + FILE * stream; // input to read data from + size_t position; // 0-based position in stream + Diagnostics *validati; // any validation diagnostics + const Error *error; // any error which stops program +} PState; + +// UState - mutable state while unparsing infoset + +typedef struct UState +{ + FILE * stream; // output to write data to + size_t position; // 0-based position in stream + Diagnostics *validati; // any validation diagnostics + const Error *error; // any error which stops program +} UState; + +// need_diagnostics - return pointer to validation diagnostics + +extern Diagnostics *need_diagnostics(void); Review comment: True, we could make the PState/UState have a Diagnostics instance itself. Then there would be no need for a get_diagnostics function. My thinking is that the majority of schemas I've seen don't use fixed values so I don't want to even call get_diagnostics and initialize the Diagnostics instance unless I have to. That thinking probably will change over time. ########## File path: README.md ########## @@ -23,107 +26,134 @@ [<img src="https://img.shields.io/maven-central/v/org.apache.daffodil/daffodil-core_2.12.svg?color=brightgreen&label=version" align="right"/>][Releases] <br clear="both" /> -Apache Daffodil is an open-source implementation of the [DFDL specification] -that uses DFDL data descriptions to parse fixed format data into an infoset. -This infoset is commonly converted into XML or JSON to enable the use of -well-established XML or JSON technologies and libraries to consume, inspect, -and manipulate fixed format data in existing solutions. Daffodil is also -capable of serializing or "unparsing" data back to the original data format. -The DFDL infoset can also be converted directly to/from the data structures -carried by data processing frameworks so as to bypass any XML/JSON overheads. +Apache Daffodil is an open-source implementation of the [DFDL +specification] that uses DFDL data descriptions to parse fixed format +data into an infoset. This infoset is commonly converted into XML or +JSON to enable the use of well-established XML or JSON technologies +and libraries to consume, inspect, and manipulate fixed format data in +existing solutions. Daffodil is also capable of serializing or +"unparsing" data back to the original data format. The DFDL infoset +can also be converted directly to/from the data structures carried by +data processing frameworks so as to bypass any XML/JSON overheads. -For more information about Daffodil, see https://daffodil.apache.org/. +For more information about Daffodil, see the [Website]. ## Build Requirements * JDK 8 or higher * SBT 0.13.8 or higher -* C compiler (for daffodil-runtime2 only) -* Mini-XML Version 3.2 or higher (for daffodil-runtime2 only) +* C compiler C99 or higher +* Mini-XML Version 3.2 or higher + +Since Daffodil has a DFDL to C backend, you will need a C compiler Review comment: The C backend is a code generator written in 100% Scala. The most important reason why we call the C compiler during the sbt build is to alert developers as soon as possible if any change anywhere causes a runtime2 problem. We call the C compiler in multiple places: 1. Explicitly in build.sbt (building daffodil-runtime2 always calls the sbt-cc plugin on all C source files) 2. Implicitly in `sbt test` (running daffodil-runtime2's unit/TDML tests and daffodil-test's runtime2-specific TDML tests also tests Daffodil's ability to call the C compiler) We probably could find a way (with a custom sbt plugin or some conditional code in build.sbt) to still build daffodil-runtime2 so that the CLI's "generate c" command still works but disable running both 1) the sbt-cc plugin and 2) the daffodil-runtime2 unit/TDML tests during a "special" sbt build. I'd prefer to put the onus on people to run a "special" sbt build that disables any C compilation so that we still alert developers to problems as early as possible (the longer problems hide, the more time it takes to fix them once they surface). You're more familiar with SBT's DSL scripting than I am; what's the easiest way to make build.sbt skip running sbt-cc and skip running runtime2-specific unit/TDML tests upon request? ########## File path: daffodil-runtime2/src/main/resources/c/libruntime/errors.h ########## @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ERRORS_H +#define ERRORS_H + +#include <stdio.h> // for FILE, size_t +#include <stdint.h> // for int64_t + +// ErrorCode - types of errors which could occur + +enum ErrorCode +{ + ERR_CHOICE_KEY, + ERR_FILE_CLOSE, + ERR_FILE_FLUSH, + ERR_FILE_OPEN, + ERR_FIXED_VALUE, + ERR_INFOSET_READ, + ERR_INFOSET_WRITE, + ERR_PARSE_BOOL, + ERR_STACK_EMPTY, + ERR_STACK_OVERFLOW, + ERR_STACK_UNDERFLOW, + ERR_STREAM_EOF, + ERR_STREAM_ERROR, + ERR_STRTOBOOL, + ERR_STRTOD_ERRNO, + ERR_STRTOI_ERRNO, + ERR_STRTONUM_EMPTY, + ERR_STRTONUM_NOT, + ERR_STRTONUM_RANGE, + ERR_XML_DECL, + ERR_XML_ELEMENT, + ERR_XML_ERD, + ERR_XML_GONE, + ERR_XML_INPUT, + ERR_XML_LEFT, + ERR_XML_MISMATCH, + ERR_XML_WRITE +}; + +// Error - specific error occuring now + +typedef struct Error +{ + enum ErrorCode code; + union + { + const char *s; // for %s + int64_t d64; // for %d64 + }; +} Error; + +// Diagnostics - array of validation errors + +typedef struct Diagnostics +{ + Error array[100]; + size_t length; +} Diagnostics; + +// PState - mutable state while parsing data + +typedef struct PState +{ + FILE * stream; // input to read data from + size_t position; // 0-based position in stream + Diagnostics *validati; // any validation diagnostics + const Error *error; // any error which stops program +} PState; + +// UState - mutable state while unparsing infoset + +typedef struct UState +{ + FILE * stream; // output to write data to + size_t position; // 0-based position in stream + Diagnostics *validati; // any validation diagnostics + const Error *error; // any error which stops program +} UState; + +// need_diagnostics - return pointer to validation diagnostics Review comment: No, your suggestion is better. Renamed to get_diagnostics. ########## File path: README.md ########## @@ -23,107 +26,134 @@ [<img src="https://img.shields.io/maven-central/v/org.apache.daffodil/daffodil-core_2.12.svg?color=brightgreen&label=version" align="right"/>][Releases] <br clear="both" /> -Apache Daffodil is an open-source implementation of the [DFDL specification] -that uses DFDL data descriptions to parse fixed format data into an infoset. -This infoset is commonly converted into XML or JSON to enable the use of -well-established XML or JSON technologies and libraries to consume, inspect, -and manipulate fixed format data in existing solutions. Daffodil is also -capable of serializing or "unparsing" data back to the original data format. -The DFDL infoset can also be converted directly to/from the data structures -carried by data processing frameworks so as to bypass any XML/JSON overheads. +Apache Daffodil is an open-source implementation of the [DFDL +specification] that uses DFDL data descriptions to parse fixed format +data into an infoset. This infoset is commonly converted into XML or +JSON to enable the use of well-established XML or JSON technologies +and libraries to consume, inspect, and manipulate fixed format data in +existing solutions. Daffodil is also capable of serializing or +"unparsing" data back to the original data format. The DFDL infoset +can also be converted directly to/from the data structures carried by +data processing frameworks so as to bypass any XML/JSON overheads. -For more information about Daffodil, see https://daffodil.apache.org/. +For more information about Daffodil, see the [Website]. ## Build Requirements * JDK 8 or higher * SBT 0.13.8 or higher -* C compiler (for daffodil-runtime2 only) -* Mini-XML Version 3.2 or higher (for daffodil-runtime2 only) +* C compiler C99 or higher +* Mini-XML Version 3.2 or higher + +Since Daffodil has a DFDL to C backend, you will need a C compiler +([gcc] or [clang]), the [Mini-XML] library, and possibly the GNU +[argp] library if your system's C library doesn't include it. You can +install gcc and libmxml as system packages on most Unix based +platforms with distribution-specific packager commands such as (Debian +and Ubuntu): + + # Just mentioning all other packages you might need too + sudo apt install build-essential curl git libmxml-dev + +You will need the Java Software Development Kit ([JDK]) and the Scala +Build Tool ([SBT]) to build Daffodil, run all tests, create packages, +and more. [SDK] offers an easy and uniform way to install both java +and sbt on any Unix based platform: + + curl -s "https://get.sdkman.io" | bash Review comment: You're right, I actually read the script carefully before I ran it and I shouldn't suggest to anyone it's okay to download a script from the Internet and pipe it to bash. If you already have sdk installed, though, then showing people how to install the JDK and SBT become very simple and uniform one-liners that work on any UNIX platform (sigh, wish sdk was everywhere). I'll just recommend people install JDK and SBT directly from their websites, though. ########## File path: README.md ########## @@ -23,107 +26,134 @@ [<img src="https://img.shields.io/maven-central/v/org.apache.daffodil/daffodil-core_2.12.svg?color=brightgreen&label=version" align="right"/>][Releases] <br clear="both" /> -Apache Daffodil is an open-source implementation of the [DFDL specification] -that uses DFDL data descriptions to parse fixed format data into an infoset. -This infoset is commonly converted into XML or JSON to enable the use of -well-established XML or JSON technologies and libraries to consume, inspect, -and manipulate fixed format data in existing solutions. Daffodil is also -capable of serializing or "unparsing" data back to the original data format. -The DFDL infoset can also be converted directly to/from the data structures -carried by data processing frameworks so as to bypass any XML/JSON overheads. +Apache Daffodil is an open-source implementation of the [DFDL +specification] that uses DFDL data descriptions to parse fixed format +data into an infoset. This infoset is commonly converted into XML or +JSON to enable the use of well-established XML or JSON technologies +and libraries to consume, inspect, and manipulate fixed format data in +existing solutions. Daffodil is also capable of serializing or +"unparsing" data back to the original data format. The DFDL infoset +can also be converted directly to/from the data structures carried by +data processing frameworks so as to bypass any XML/JSON overheads. -For more information about Daffodil, see https://daffodil.apache.org/. +For more information about Daffodil, see the [Website]. ## Build Requirements * JDK 8 or higher * SBT 0.13.8 or higher -* C compiler (for daffodil-runtime2 only) -* Mini-XML Version 3.2 or higher (for daffodil-runtime2 only) +* C compiler C99 or higher +* Mini-XML Version 3.2 or higher + +Since Daffodil has a DFDL to C backend, you will need a C compiler +([gcc] or [clang]), the [Mini-XML] library, and possibly the GNU +[argp] library if your system's C library doesn't include it. You can +install gcc and libmxml as system packages on most Unix based +platforms with distribution-specific packager commands such as (Debian +and Ubuntu): + + # Just mentioning all other packages you might need too + sudo apt install build-essential curl git libmxml-dev Review comment: I like the idea of putting the instructions in a separate markdown file better. I've moved everything I added to README.md into BUILD.md except for the four build requirements and "See BUILD.md for more details" below them. I still improved some parts of README.md anyway :). ########## File path: daffodil-runtime2/src/main/resources/c/libcli/daffodil_main.c ########## @@ -85,23 +93,25 @@ main(int argc, char *argv[]) output = fopen_or_exit(output, daffodil_parse.outfile, "w"); // Parse the input file into our infoset. - PState pstate = {input, 0, NULL}; + PState pstate = {input, 0, NULL, NULL}; root->erd->parseSelf(root, &pstate); - continue_or_exit(pstate.error_msg); + print_diagnostics(pstate.validati); Review comment: No, `validati` is not a typo, I just didn't want a field name longer than ~ 8 characters. I've changed it to `diagnostics` which is only 11 characters long anyway. ########## File path: daffodil-runtime2/src/main/resources/c/libruntime/errors.h ########## @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ERRORS_H +#define ERRORS_H + +#include <stdio.h> // for FILE, size_t +#include <stdint.h> // for int64_t + +// ErrorCode - types of errors which could occur + +enum ErrorCode +{ + ERR_CHOICE_KEY, + ERR_FILE_CLOSE, + ERR_FILE_FLUSH, + ERR_FILE_OPEN, + ERR_FIXED_VALUE, + ERR_INFOSET_READ, + ERR_INFOSET_WRITE, + ERR_PARSE_BOOL, + ERR_STACK_EMPTY, + ERR_STACK_OVERFLOW, + ERR_STACK_UNDERFLOW, + ERR_STREAM_EOF, + ERR_STREAM_ERROR, + ERR_STRTOBOOL, + ERR_STRTOD_ERRNO, + ERR_STRTOI_ERRNO, + ERR_STRTONUM_EMPTY, + ERR_STRTONUM_NOT, + ERR_STRTONUM_RANGE, + ERR_XML_DECL, + ERR_XML_ELEMENT, + ERR_XML_ERD, + ERR_XML_GONE, + ERR_XML_INPUT, + ERR_XML_LEFT, + ERR_XML_MISMATCH, + ERR_XML_WRITE +}; Review comment: ERR_FIXED_VALUE is used only as a validation error. The rest are considered fatal errors which cause the program to stop immediately. When runtime2 grows larger, then we may need ERR_, WRN_, VLD_, and TRY_ to distinguish different kinds of diagnostics, but we haven't gotten that far yet. ########## File path: daffodil-runtime2/src/main/resources/c/libcli/xml_writer.c ########## @@ -16,83 +16,93 @@ */ #include "xml_writer.h" -#include "stack.h" // for stack_is_empty, stack_pop, stack_push, stack_top, stack_init, stack_is_full #include <assert.h> // for assert -#include <mxml.h> // for mxmlNewOpaquef, mxml_node_t, mxmlElementSetAttr, mxmlNewElement, mxmlDelete, mxmlNewXML, mxmlSaveFile, MXML_NO_CALLBACK +#include <mxml.h> // for mxmlNewOpaquef, mxml_node_t, mxmlElementSetAttr, mxmlGetOpaque, mxmlNewElement, mxmlDelete, mxmlGetElement, mxmlNewXML, mxmlSaveFile, MXML_NO_CALLBACK #include <stdbool.h> // for bool #include <stdint.h> // for int16_t, int32_t, int64_t, int8_t, uint16_t, uint32_t, uint64_t, uint8_t -#include <stdio.h> // for NULL, fflush +#include <stdio.h> // for NULL #include <string.h> // for strcmp +#include "errors.h" // for Error, ERR_XML_DECL, ERR_XML_ELEMENT, ERR_XML_WRITE, Error::(anonymous) +#include "stack.h" // for stack_is_empty, stack_pop, stack_push, stack_top, stack_init -// Push new XML document on stack. This function is not -// thread-safe since it uses static storage. +// Push new XML document on stack (note the stack is stored in a +// static array which could overflow and stop the program; it also +// means none of those functions are thread-safe) -static const char * +static const Error * xmlStartDocument(XMLWriter *writer) { -#define MAX_DEPTH 100 + enum + { + MAX_DEPTH = 100 Review comment: Well, you have a good point. There are only three places where fixed size arrays occur and my hope was that we'd never need to change their limits, but I've declared each limit as an enum Limits constant inside errors.h now. ```c libruntime/errors.h 73: Error array[100]; libruntime/infoset.c 28: static char name[9999]; 69: static char xmlns[9999]; libcli/xml_writer.c 37: MAX_DEPTH = 100 39: static mxml_node_t *array[MAX_DEPTH]; ``` ########## File path: daffodil-runtime2/src/main/resources/c/libcli/daffodil_main.c ########## @@ -113,22 +123,24 @@ main(int argc, char *argv[]) if (strcmp(daffodil_unparse.infoset_converter, "xml") == 0) { // Initialize our infoset's values from the XML data. - XMLReader xmlReader = { - xmlReaderMethods, input, root, NULL, NULL}; - const char *error_msg = + XMLReader xmlReader = {xmlReaderMethods, input, root, NULL, + NULL}; + const Error *error = walkInfoset((VisitEventHandler *)&xmlReader, root); - continue_or_exit(error_msg); + continue_or_exit(error); } else { - error(EXIT_FAILURE, 0, "Cannot read infoset type '%s'", - daffodil_unparse.infoset_converter); + const Error error = {ERR_INFOSET_READ, + {daffodil_unparse.infoset_converter}}; Review comment: Good point. I've added `ColumnLimit: 110` to .clang_format and reformatted the C source files. ########## File path: daffodil-runtime2/src/main/resources/c/libruntime/errors.c ########## @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "errors.h" +#include <error.h> // for error +#include <inttypes.h> // for PRId64 +#include <stdio.h> // for NULL, feof, ferror, FILE, size_t +#include <stdlib.h> // for EXIT_FAILURE + +// error_message - return an internationalized error message + +static const char * +error_message(enum ErrorCode code) +{ + switch (code) + { + case ERR_CHOICE_KEY: + return "no match between choice dispatch key %" PRId64 + " and any branch key"; + case ERR_FILE_CLOSE: + return "error closing file"; + case ERR_FILE_FLUSH: + return "error flushing stream to file"; + case ERR_FILE_OPEN: + return "error opening file '%s'"; + case ERR_FIXED_VALUE: + return "value of element '%s' does not match value of its " + "'fixed' attribute"; + case ERR_INFOSET_READ: + return "cannot read infoset type '%s'"; + case ERR_INFOSET_WRITE: + return "cannot write infoset type '%s'"; + case ERR_PARSE_BOOL: + return "error parsing binary value %" PRId64 " as either true or false"; + case ERR_STACK_EMPTY: + return "stack empty, stopping program"; + case ERR_STACK_OVERFLOW: + return "stack overflow, stopping program"; + case ERR_STACK_UNDERFLOW: + return "stack underflow, stopping program"; + case ERR_STREAM_EOF: + return "EOF in stream, stopping program"; + case ERR_STREAM_ERROR: + return "error in stream, stopping program"; + case ERR_STRTOBOOL: + return "error converting XML data '%s' to boolean"; + case ERR_STRTOD_ERRNO: + return "error converting XML data '%s' to number"; + case ERR_STRTOI_ERRNO: + return "error converting XML data '%s' to integer"; + case ERR_STRTONUM_EMPTY: + return "found no number in XML data '%s'"; + case ERR_STRTONUM_NOT: + return "found non-number characters in XML data '%s'"; + case ERR_STRTONUM_RANGE: + return "number in XML data '%s' out of range"; + case ERR_XML_DECL: + return "error making new XML declaration"; + case ERR_XML_ELEMENT: + return "error making new XML element '%s'"; + case ERR_XML_ERD: + return "unexpected ERD typeCode %" PRId64 " while reading XML data"; + case ERR_XML_GONE: + return "ran out of XML data"; + case ERR_XML_INPUT: + return "unable to read XML data from input file"; + case ERR_XML_LEFT: + return "did not consume all of the XML data, '%s' left"; + case ERR_XML_MISMATCH: + return "found mismatch between XML data and infoset '%s'"; + case ERR_XML_WRITE: + return "error writing XML document"; + default: + return "unrecognized error code, shouldn't happen"; Review comment: I've inserted an assert call before the return but left the return unchanged since assert calls can be compiled out. ########## File path: daffodil-runtime2/src/main/resources/c/libcli/xml_reader.c ########## @@ -327,70 +346,75 @@ xmlNumberElem(XMLReader *reader, const ERD *erd, void *number) { if (strcmp(name_from_xml, name_from_erd) == 0) { + static Error error_erd = {ERR_XML_ERD, {NULL}}; + // Check for any errors getting the number - const char *errstr = NULL; + const Error *error = NULL; // Handle varying bit lengths of both signed & unsigned numbers const enum TypeCode typeCode = erd->typeCode; switch (typeCode) { case PRIMITIVE_BOOLEAN: - *(bool *)number = strtobool(number_from_xml, &errstr); + *(bool *)number = strtobool(number_from_xml, &error); break; case PRIMITIVE_FLOAT: - *(float *)number = strtofnum(number_from_xml, &errstr); + *(float *)number = strtofnum(number_from_xml, &error); break; case PRIMITIVE_DOUBLE: - *(double *)number = strtodnum(number_from_xml, &errstr); + *(double *)number = strtodnum(number_from_xml, &error); break; case PRIMITIVE_INT16: *(int16_t *)number = (int16_t)strtonum( - number_from_xml, INT16_MIN, INT16_MAX, &errstr); + number_from_xml, INT16_MIN, INT16_MAX, &error); break; case PRIMITIVE_INT32: *(int32_t *)number = (int32_t)strtonum( - number_from_xml, INT32_MIN, INT32_MAX, &errstr); + number_from_xml, INT32_MIN, INT32_MAX, &error); break; case PRIMITIVE_INT64: *(int64_t *)number = (int64_t)strtonum( - number_from_xml, INT64_MIN, INT64_MAX, &errstr); + number_from_xml, INT64_MIN, INT64_MAX, &error); break; case PRIMITIVE_INT8: *(int8_t *)number = (int8_t)strtonum(number_from_xml, INT8_MIN, - INT8_MAX, &errstr); + INT8_MAX, &error); break; case PRIMITIVE_UINT16: *(uint16_t *)number = - (uint16_t)strtounum(number_from_xml, UINT16_MAX, &errstr); + (uint16_t)strtounum(number_from_xml, UINT16_MAX, &error); break; case PRIMITIVE_UINT32: *(uint32_t *)number = - (uint32_t)strtounum(number_from_xml, UINT32_MAX, &errstr); + (uint32_t)strtounum(number_from_xml, UINT32_MAX, &error); break; case PRIMITIVE_UINT64: *(uint64_t *)number = - (uint64_t)strtounum(number_from_xml, UINT64_MAX, &errstr); + (uint64_t)strtounum(number_from_xml, UINT64_MAX, &error); break; case PRIMITIVE_UINT8: *(uint8_t *)number = - (uint8_t)strtounum(number_from_xml, UINT8_MAX, &errstr); + (uint8_t)strtounum(number_from_xml, UINT8_MAX, &error); break; default: - errstr = "Unexpected ERD typeCode while reading number from " - "XML data"; + error_erd.d64 = typeCode; + error = &error_erd; Review comment: I can define error_erd at line 400 following the default: label only if I enclose lines 400-401 inside braces (otherwise gcc prints `error: a label can only be part of a statement and a declaration is not a statement`). I had decided to define error_erd before the switch statement because clang-format wouldn't indent the braces (it left them lined up with the label and the statement's ending brace), but on second thought the error handling looks more uniform with error_erd defined inside a block so I'm doing it now. ########## File path: daffodil-runtime2/src/main/resources/c/libruntime/errors.c ########## @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "errors.h" +#include <error.h> // for error +#include <inttypes.h> // for PRId64 +#include <stdio.h> // for NULL, feof, ferror, FILE, size_t +#include <stdlib.h> // for EXIT_FAILURE + +// error_message - return an internationalized error message + +static const char * +error_message(enum ErrorCode code) +{ + switch (code) + { + case ERR_CHOICE_KEY: + return "no match between choice dispatch key %" PRId64 + " and any branch key"; + case ERR_FILE_CLOSE: + return "error closing file"; + case ERR_FILE_FLUSH: + return "error flushing stream to file"; + case ERR_FILE_OPEN: + return "error opening file '%s'"; + case ERR_FIXED_VALUE: + return "value of element '%s' does not match value of its " + "'fixed' attribute"; + case ERR_INFOSET_READ: + return "cannot read infoset type '%s'"; + case ERR_INFOSET_WRITE: + return "cannot write infoset type '%s'"; + case ERR_PARSE_BOOL: + return "error parsing binary value %" PRId64 " as either true or false"; + case ERR_STACK_EMPTY: + return "stack empty, stopping program"; + case ERR_STACK_OVERFLOW: + return "stack overflow, stopping program"; + case ERR_STACK_UNDERFLOW: + return "stack underflow, stopping program"; + case ERR_STREAM_EOF: + return "EOF in stream, stopping program"; + case ERR_STREAM_ERROR: + return "error in stream, stopping program"; + case ERR_STRTOBOOL: + return "error converting XML data '%s' to boolean"; + case ERR_STRTOD_ERRNO: + return "error converting XML data '%s' to number"; + case ERR_STRTOI_ERRNO: + return "error converting XML data '%s' to integer"; + case ERR_STRTONUM_EMPTY: + return "found no number in XML data '%s'"; + case ERR_STRTONUM_NOT: + return "found non-number characters in XML data '%s'"; + case ERR_STRTONUM_RANGE: + return "number in XML data '%s' out of range"; + case ERR_XML_DECL: + return "error making new XML declaration"; + case ERR_XML_ELEMENT: + return "error making new XML element '%s'"; + case ERR_XML_ERD: + return "unexpected ERD typeCode %" PRId64 " while reading XML data"; + case ERR_XML_GONE: + return "ran out of XML data"; + case ERR_XML_INPUT: + return "unable to read XML data from input file"; + case ERR_XML_LEFT: + return "did not consume all of the XML data, '%s' left"; + case ERR_XML_MISMATCH: + return "found mismatch between XML data and infoset '%s'"; + case ERR_XML_WRITE: + return "error writing XML document"; + default: + return "unrecognized error code, shouldn't happen"; + } +} + +// print_maybe_stop - print a message and maybe stop the program + +static void +print_maybe_stop(const Error *err, int status) +{ + const int errnum = 0; + const char *format = "%s"; + const char *msg = error_message(err->code); + + switch (err->code) + { + case ERR_FILE_OPEN: + case ERR_FIXED_VALUE: + case ERR_INFOSET_READ: + case ERR_INFOSET_WRITE: + case ERR_STRTOBOOL: + case ERR_STRTOD_ERRNO: + case ERR_STRTOI_ERRNO: + case ERR_STRTONUM_EMPTY: + case ERR_STRTONUM_NOT: + case ERR_STRTONUM_RANGE: + case ERR_XML_ELEMENT: + case ERR_XML_LEFT: + case ERR_XML_MISMATCH: + error(status, errnum, msg, err->s); + break; + case ERR_CHOICE_KEY: + case ERR_PARSE_BOOL: + case ERR_XML_ERD: + error(status, errnum, msg, err->d64); + break; + default: + error(status, errnum, format, msg); + break; + } +} + +// need_diagnostics - return pointer to validation diagnostics + +Diagnostics * +need_diagnostics(void) +{ + static Diagnostics validati; + return &validati; Review comment: Returning a pointer to a static variable defined inside a function is a good way to provide some storage without polluting the global namespace with a global variable's name, yes. It's also more flexible since we can change the function to use an array, a heap, etc., at a later time. ########## File path: daffodil-runtime2/src/main/scala/org/apache/daffodil/runtime2/generators/BinaryAbstractCodeGenerator.scala ########## @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.daffodil.runtime2.generators + +import org.apache.daffodil.dsom.ElementBase +import org.apache.daffodil.schema.annotation.props.gen.BitOrder +import org.apache.daffodil.schema.annotation.props.gen.ByteOrder +import org.apache.daffodil.schema.annotation.props.gen.OccursCountKind + +trait BinaryAbstractCodeGenerator { + + def binaryAbstractGenerateCode(e: ElementBase, initialValue: String, prim: String, + parseArgs: String, unparseArgs: String, cgState: CodeGeneratorState): Unit = { + + // For the time being this is a very limited back end. + // So there are some restrictions to enforce. + e.schemaDefinitionUnless(e.bitOrder eq BitOrder.MostSignificantBitFirst, "Only dfdl:bitOrder 'mostSignificantBitFirst' is supported.") + e.schemaDefinitionUnless(e.byteOrderEv.isConstant, "Runtime dfdl:byteOrder expressions not supported.") + e.schemaDefinitionUnless(e.elementLengthInBitsEv.isConstant, "Runtime dfdl:length expressions not supported.") + + val fieldName = e.namedQName.local + val byteOrder = e.byteOrderEv.constValue + val conv = if (byteOrder eq ByteOrder.BigEndian) "be" else "le" + val arraySize = if (e.occursCountKind == OccursCountKind.Fixed) e.maxOccurs else 0 + val fixed = e.xml.attribute("fixed") + val fixedValue = if (fixed.isDefined) fixed.get.text else "" + + def addStatements(deref: String): Unit = { + val initStatement = s" instance->$fieldName$deref = $initialValue;" + val parseStatement = + s""" parse_${conv}_$prim(&instance->$fieldName$deref, $parseArgs); + | if (pstate->error) return;""".stripMargin + val unparseStatement = + s""" unparse_${conv}_$prim(instance->$fieldName$deref, $unparseArgs); + | if (ustate->error) return;""".stripMargin + cgState.addSimpleTypeStatements(initStatement, parseStatement, unparseStatement) + + if (fixedValue.nonEmpty) { + val init2 = "" + val parse2 = + s""" parse_validate_fixed(instance->$fieldName$deref == $fixedValue, "$fieldName", pstate); Review comment: Yes, we had a security issue. Now that I'm using the DSOM to convert the XML attribute to the primitive data type and back to a string, a fixed='0, "foo", pstate); malicious C code here; //' will result in an error. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected]
