kojiromike commented on a change in pull request #936:
URL: https://github.com/apache/avro/pull/936#discussion_r461544142



##########
File path: lang/py/avro/schema.py
##########
@@ -521,6 +541,21 @@ def to_json(self, names=None):
         else:
             return self.props
 
+    def validate(self, datum):
+        """Return self if datum is a valid representation of this type of 
primitive schema, else None"""
+        is_valid = {
+            'null': lambda x: x is None,
+            'boolean': lambda x: isinstance(x, bool),
+            'string': lambda x: isinstance(x, unicode),
+            'bytes': lambda x: isinstance(x, bytes),
+            'int': lambda x: isinstance(x, int) and INT_MIN_VALUE <= x <= 
INT_MAX_VALUE,
+            'long': lambda x: isinstance(x, int) and LONG_MIN_VALUE <= x <= 
LONG_MAX_VALUE,
+            'float': lambda x: isinstance(x, (int, float)),
+            'double': lambda x: isinstance(x, (int, float)),
+        }.get(self.type, lambda x: False)(datum)

Review comment:
       Just having the primitive schema validation isolated from _all the 
validators_ is nice.

##########
File path: lang/py/avro/schema.py
##########
@@ -919,6 +984,10 @@ def to_json(self, names=None):
         to_dump['fields'] = [f.to_json(names) for f in self.fields]
         return to_dump
 
+    def validate(self, datum):
+        """Return self if datum is a valid representation of this schema, else 
None"""
+        return self if isinstance(datum, dict) and {f.name for f in 
self.fields}.issuperset(datum.keys()) else None

Review comment:
       Should this not validate the field values?

##########
File path: lang/py/avro/schema.py
##########
@@ -521,6 +541,21 @@ def to_json(self, names=None):
         else:
             return self.props
 
+    def validate(self, datum):
+        """Return self if datum is a valid representation of this type of 
primitive schema, else None"""
+        is_valid = {
+            'null': lambda x: x is None,
+            'boolean': lambda x: isinstance(x, bool),
+            'string': lambda x: isinstance(x, unicode),
+            'bytes': lambda x: isinstance(x, bytes),
+            'int': lambda x: isinstance(x, int) and INT_MIN_VALUE <= x <= 
INT_MAX_VALUE,
+            'long': lambda x: isinstance(x, int) and LONG_MIN_VALUE <= x <= 
LONG_MAX_VALUE,
+            'float': lambda x: isinstance(x, (int, float)),
+            'double': lambda x: isinstance(x, (int, float)),
+        }.get(self.type, lambda x: False)(datum)

Review comment:
       I agree that this is not ideal. I'd prefer the types to exist outright, 
instead of `PrimitiveSchema`, just `IntSchema`, etc. Then mapping validators to 
types is natural. I've tried to do this several times, but got hung up in 
complexities. (There are a lot of Liskov violations in here.) IMO it's always 
worth another shot, but I wouldn't suggest taking it on in this PR, since this 
is already a big improvement.
   
   I'd suggest making this dictionary a class attribute and then deciding the 
validator at init time. But other than that I think it's about as good as it 
can be without going too far off track.

##########
File path: lang/py/avro/io.py
##########
@@ -112,84 +164,218 @@ def __init__(self, fail_msg, writers_schema=None, 
readers_schema=None):
             fail_msg += "\nReader's Schema: %s" % pretty_readers
         schema.AvroException.__init__(self, fail_msg)
 
+
 #
 # Validate
 #
 
-
 def _is_timezone_aware_datetime(dt):
     return dt.tzinfo is not None and dt.tzinfo.utcoffset(dt) is not None
 
 
-_valid = {
-    'null': lambda s, d: d is None,
-    'boolean': lambda s, d: isinstance(d, bool),
-    'string': lambda s, d: isinstance(d, unicode),
-    'bytes': lambda s, d: ((isinstance(d, bytes)) or
-                           (isinstance(d, Decimal) and
-                            getattr(s, 'logical_type', None) == 
constants.DECIMAL)),
-    'int': lambda s, d: ((isinstance(d, (int, long))) and (INT_MIN_VALUE <= d 
<= INT_MAX_VALUE) or
-                         (isinstance(d, datetime.date) and
-                          getattr(s, 'logical_type', None) == constants.DATE) 
or
-                         (isinstance(d, datetime.time) and
-                          getattr(s, 'logical_type', None) == 
constants.TIME_MILLIS)),
-    'long': lambda s, d: ((isinstance(d, (int, long))) and (LONG_MIN_VALUE <= 
d <= LONG_MAX_VALUE) or
-                          (isinstance(d, datetime.time) and
-                           getattr(s, 'logical_type', None) == 
constants.TIME_MICROS) or
-                          (isinstance(d, datetime.date) and
-                           _is_timezone_aware_datetime(d) and
-                           getattr(s, 'logical_type', None) in 
(constants.TIMESTAMP_MILLIS,
-                                                                
constants.TIMESTAMP_MICROS))),
-    'float': lambda s, d: isinstance(d, (int, long, float)),
-    'fixed': lambda s, d: ((isinstance(d, bytes) and len(d) == s.size) or
-                           (isinstance(d, Decimal) and
-                            getattr(s, 'logical_type', None) == 
constants.DECIMAL)),
-    'enum': lambda s, d: d in s.symbols,
-
-    'array': lambda s, d: isinstance(d, list) and all(validate(s.items, item) 
for item in d),
-    'map': lambda s, d: (isinstance(d, dict) and all(isinstance(key, unicode) 
for key in d) and
-                         all(validate(s.values, value) for value in 
d.values())),
-    'union': lambda s, d: any(validate(branch, d) for branch in s.schemas),
-    'record': lambda s, d: (isinstance(d, dict) and
-                            all(validate(f.type, d.get(f.name)) for f in 
s.fields) and
-                            {f.name for f in s.fields}.issuperset(d.keys())),
+def validate(expected_schema, datum, raise_on_error=False):
+    """Return True if the provided datum is valid for the excpted schema
+
+    If raise_on_error is passed and True, then raise a validation error
+    with specific information about the error encountered in validation.
+
+    :param expected_schema: An avro schema type object representing the schema 
against
+                            which the datum will be validated.
+    :param datum: The datum to be validated, A python dictionary or some 
supported type
+    :param raise_on_error: True if a AvroTypeException should be raised 
immediately when a
+                           validation problem is encountered.
+    :raises: AvroTyeException if datum is invalid and raise_on_error is True
+    :returns: True if datum is valid for expected_schema, False if not.
+    """
+    # use a FIFO queue to process schema nodes breadth first.
+    nodes = deque()
+    nodes.append(ValidationNode(expected_schema, datum, 
getattr(expected_schema, "name", None)))
+
+    while nodes:
+        current_node = nodes.popleft()
+
+        # _validate_node returns the node for iteration if it is valid. Or it 
returns None
+        valid_node = _validate_node(current_node)
+
+        if valid_node is not None:
+            # if there are children of this node to append, do so.
+            for child_node in _iterate_node(valid_node):
+                nodes.append(child_node)
+        else:
+            # the current node was not valid.
+            if raise_on_error:
+                raise AvroTypeException(current_node.schema, 
current_node.datum)
+            else:
+                # preserve the prior validation behavior of returning false 
when there are problems.
+                return False
+
+    return True
+
+
+def _validate_node(node):
+    """Return the result of applying the appropriate validator function to the 
provided node"""
+    # breakpoint()
+    key = (node.schema.type, getattr(node.schema, 'logical_type', None))
+    return _VALIDATORS.get(key, _default_validator)(node)
+
+
+def _iterate_node(node):
+    for item in _ITERATORS.get(node.schema.type, _default_iterator)(node):
+        yield ValidationNode(*item)
+
+
+#############
+# Iteration #
+#############
+
+def _default_iterator(_):
+    """Immediately raise StopIteration.
+
+    This exists to prevent problems with iteration over unsupported container 
types.
+    """
+    for blank in []:

Review comment:
       Sigh. Yeah, OK, that's one for the mailing list, isn't it?

##########
File path: lang/py/avro/schema.py
##########
@@ -919,6 +984,10 @@ def to_json(self, names=None):
         to_dump['fields'] = [f.to_json(names) for f in self.fields]
         return to_dump
 
+    def validate(self, datum):
+        """Return self if datum is a valid representation of this schema, else 
None"""
+        return self if isinstance(datum, dict) and {f.name for f in 
self.fields}.issuperset(datum.keys()) else None

Review comment:
       Is this map schema? I thought it was record. Sorry! Very hard to tell in 
github mobile.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to