Repository: incubator-airflow Updated Branches: refs/heads/master 7c233179e -> 648b14b4d
[AIRFLOW-2452] Document field_dict must be OrderedDict HiveCliHook.load_file has a parameter called field_dict, which defines name-type pairs for columns and must be an OrderedDict so as to keep the columns' order, but this is undocumented. This PR adds a note about that, and fixes the HiveCliHook.load_df function, which calls load_file internally. Closes #3347 from sekikn/AIRFLOW-2452 Project: http://git-wip-us.apache.org/repos/asf/incubator-airflow/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-airflow/commit/648b14b4 Tree: http://git-wip-us.apache.org/repos/asf/incubator-airflow/tree/648b14b4 Diff: http://git-wip-us.apache.org/repos/asf/incubator-airflow/diff/648b14b4 Branch: refs/heads/master Commit: 648b14b4d95bf3aca26e8b54ffe8585b52efc8fd Parents: 7c23317 Author: Kengo Seki <sek...@apache.org> Authored: Tue May 15 10:53:29 2018 -0700 Committer: r39132 <siddharthan...@yahoo.com> Committed: Tue May 15 10:53:29 2018 -0700 ---------------------------------------------------------------------- airflow/hooks/hive_hooks.py | 16 +++++++++++----- tests/hooks/test_hive_hook.py | 2 ++ 2 files changed, 13 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-airflow/blob/648b14b4/airflow/hooks/hive_hooks.py ---------------------------------------------------------------------- diff --git a/airflow/hooks/hive_hooks.py b/airflow/hooks/hive_hooks.py index 6c9a907..92958c0 100644 --- a/airflow/hooks/hive_hooks.py +++ b/airflow/hooks/hive_hooks.py @@ -27,6 +27,7 @@ import itertools import re import subprocess import time +from collections import OrderedDict from tempfile import NamedTemporaryFile import hmsclient @@ -303,8 +304,9 @@ class HiveCliHook(BaseHook): :param recreate: whether to drop and recreate the table at every execution :type recreate: bool - :param field_dict: mapping from column name to hive data type - :type field_dict: dict + :param field_dict: mapping from column 
name to hive data type. + Note that it must be OrderedDict so as to keep columns' order. + :type field_dict: OrderedDict :param encoding: string encoding to use when writing DataFrame to file :type encoding: str :param pandas_kwargs: passed to DataFrame.to_csv @@ -325,7 +327,10 @@ class HiveCliHook(BaseHook): 'V': 'STRING' # void } - return dict((col, DTYPE_KIND_HIVE_TYPE[dtype.kind]) for col, dtype in df.dtypes.iteritems()) + d = OrderedDict() + for col, dtype in df.dtypes.iteritems(): + d[col] = DTYPE_KIND_HIVE_TYPE[dtype.kind] + return d if pandas_kwargs is None: pandas_kwargs = {} @@ -378,8 +383,9 @@ class HiveCliHook(BaseHook): :param delimiter: field delimiter in the file :type delimiter: str :param field_dict: A dictionary of the fields name in the file - as keys and their Hive types as values - :type field_dict: dict + as keys and their Hive types as values. + Note that it must be OrderedDict so as to keep columns' order. + :type field_dict: OrderedDict :param create: whether to create the table if it doesn't exist :type create: bool :param overwrite: whether to overwrite the data in table or partition http://git-wip-us.apache.org/repos/asf/incubator-airflow/blob/648b14b4/tests/hooks/test_hive_hook.py ---------------------------------------------------------------------- diff --git a/tests/hooks/test_hive_hook.py b/tests/hooks/test_hive_hook.py index 9eda9e3..3c278e3 100644 --- a/tests/hooks/test_hive_hook.py +++ b/tests/hooks/test_hive_hook.py @@ -25,6 +25,7 @@ import random import mock import unittest +from collections import OrderedDict from hmsclient import HMSClient from airflow.exceptions import AirflowException @@ -130,6 +131,7 @@ class TestHiveCliHook(unittest.TestCase): kwargs = mock_load_file.call_args[1] self.assertEqual(kwargs["delimiter"], delimiter) self.assertEqual(kwargs["field_dict"], {"c": u"STRING"}) + self.assertTrue(isinstance(kwargs["field_dict"], OrderedDict)) self.assertEqual(kwargs["table"], table)