[ 
https://issues.apache.org/jira/browse/ARROW-2422?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16439186#comment-16439186
 ] 

ASF GitHub Bot commented on ARROW-2422:
---------------------------------------

jneuff commented on a change in pull request #1861: ARROW-2422 Support more operators for partition filtering
URL: https://github.com/apache/arrow/pull/1861#discussion_r181675580
 
 

 ##########
 File path: python/pyarrow/tests/test_parquet.py
 ##########
 @@ -997,40 +997,159 @@ def test_read_partitioned_directory(tmpdir):
 
 
 @parquet
-def test_read_partitioned_directory_filtered(tmpdir):
-    fs = LocalFileSystem.get_instance()
-    base_path = str(tmpdir)
-
-    import pyarrow.parquet as pq
-
-    foo_keys = [0, 1]
-    bar_keys = ['a', 'b', 'c']
-    partition_spec = [
-        ['foo', foo_keys],
-        ['bar', bar_keys]
-    ]
-    N = 30
-
-    df = pd.DataFrame({
-        'index': np.arange(N),
-        'foo': np.array(foo_keys, dtype='i4').repeat(15),
-        'bar': np.tile(np.tile(np.array(bar_keys, dtype=object), 5), 2),
-        'values': np.random.randn(N)
-    }, columns=['index', 'foo', 'bar', 'values'])
-
-    _generate_partition_directories(fs, base_path, partition_spec, df)
-
-    dataset = pq.ParquetDataset(
-        base_path, filesystem=fs,
-        filters=[('foo', '=', 1), ('bar', '!=', 'b')]
+class TestParquetFilter:
+
+    def test_equivalency(tmpdir):
+        fs = LocalFileSystem.get_instance()
+        base_path = str(tmpdir)
+
+        import pyarrow.parquet as pq
+
+        integer_keys = [0, 1]
+        string_keys = ['a', 'b', 'c']
+        boolean_keys = [True, False]
+        partition_spec = [
+            ['integer', integer_keys],
+            ['string', string_keys],
+            ['boolean', boolean_keys]
+        ]
+        N = 30
+
+        df = pd.DataFrame({
+            'index': np.arange(N),
+            'integer': np.array(integer_keys, dtype='i4').repeat(15),
+            'string': np.tile(np.tile(np.array(string_keys, dtype=object), 5), 
2),
+            'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 
5), 3),
+        }, columns=['index', 'integer', 'string', 'boolean'])
+
+        _generate_partition_directories(fs, base_path, partition_spec, df)
+
+        dataset = pq.ParquetDataset(
+            base_path, filesystem=fs,
+            filters=[('integer', '=', 1), ('string', '!=', 'b'), ('boolean', 
'==', True)]
+        )
+        table = dataset.read()
+        result_df = (table.to_pandas()
+                     .sort_values(by='index')
+                     .reset_index(drop=True))
+
+        assert 0 not in result_df['integer'].values
+        assert 'b' not in result_df['string'].values
+        assert False not in result_df['boolean'].values
+
+    def test_cutoff_exclusive_integer(tmpdir):
+        fs = LocalFileSystem.get_instance()
+        base_path = str(tmpdir)
+
+        import pyarrow.parquet as pq
+
+        integer_keys = [0, 1, 2, 3, 4]
+        partition_spec = [
+            ['integers', integer_keys],
+        ]
+        N = 5
+
+        df = pd.DataFrame({
+            'index': np.arange(N),
+            'integers': np.array(integer_keys, dtype='i4'),
+        }, columns=['index', 'integers'])
+
+        _generate_partition_directories(fs, base_path, partition_spec, df)
+
+        dataset = pq.ParquetDataset(
+            base_path, filesystem=fs,
+            filters=[
+                ('integers', '<', 4),
+                ('integers', '>', 1),
+            ]
+        )
+        table = dataset.read()
+        result_df = (table.to_pandas()
+                     .sort_values(by='index')
+                     .reset_index(drop=True))
+
+        result_list = [x for x in map(int, result_df['integers'].values)]
+        assert result_list == [2, 3]
+
+    @pytest.mark.xfail(
+        raises=TypeError, reason='We suspect loss of type information in 
creation of categoricals.'
     )
 
 Review comment:
   @xhochy This is the behavior we just told you about offline: 
`result_df['dates'].values` seems to have dtype `object` instead of 
`datetime64`.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> Support more filter operators on Hive partitioned Parquet files
> ---------------------------------------------------------------
>
>                 Key: ARROW-2422
>                 URL: https://issues.apache.org/jira/browse/ARROW-2422
>             Project: Apache Arrow
>          Issue Type: Improvement
>            Reporter: Julius Neuffer
>            Priority: Minor
>              Labels: features, pull-request-available
>
> After implementing basic filters ('=', '!=') on Hive-partitioned Parquet 
> files (ARROW-2401), I'll extend them with comparison operators ('>', '<', 
> '<=', '>=') in a new PR on GitHub.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to