spark git commit: [SPARK-23360][SQL][PYTHON] Get local timezone from environment via pytz, or dateutil.

gurwls223 Sat, 10 Feb 2018 08:08:25 -0800

Repository: spark
Updated Branches:
  refs/heads/master 6d7c38330 -> 97a224a85



[SPARK-23360][SQL][PYTHON] Get local timezone from environment via pytz, or 
dateutil.

## What changes were proposed in this pull request?

Currently we use `tzlocal()` to get Python local timezone, but it sometimes 
causes unexpected behavior.
I changed the way to get Python local timezone to use pytz if the timezone is 
specified in environment variable, or timezone file via dateutil .

## How was this patch tested?

Added a test and existing tests.

Author: Takuya UESHIN <ues...@databricks.com>

Closes #20559 from ueshin/issues/SPARK-23360/master.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/97a224a8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/97a224a8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/97a224a8

Branch: refs/heads/master
Commit: 97a224a855c4410b2dfb9c0bcc6aae583bd28e92
Parents: 6d7c383
Author: Takuya UESHIN <ues...@databricks.com>
Authored: Sun Feb 11 01:08:02 2018 +0900
Committer: hyukjinkwon <gurwls...@gmail.com>
Committed: Sun Feb 11 01:08:02 2018 +0900

----------------------------------------------------------------------
 python/pyspark/sql/tests.py | 28 ++++++++++++++++++++++++++++
 python/pyspark/sql/types.py | 23 +++++++++++++++++++----
 2 files changed, 47 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/97a224a8/python/pyspark/sql/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 6ace169..1087c3f 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -2868,6 +2868,34 @@ class SQLTests(ReusedSQLTestCase):
                                     "d": [pd.Timestamp.now().date()]})
                 self.spark.createDataFrame(pdf)
 
+    # Regression test for SPARK-23360
+    @unittest.skipIf(not _have_pandas, _pandas_requirement_message)
+    def test_create_dateframe_from_pandas_with_dst(self):
+        import pandas as pd
+        from datetime import datetime
+
+        pdf = pd.DataFrame({'time': [datetime(2015, 10, 31, 22, 30)]})
+
+        df = self.spark.createDataFrame(pdf)
+        self.assertPandasEqual(pdf, df.toPandas())
+
+        orig_env_tz = os.environ.get('TZ', None)
+        orig_session_tz = self.spark.conf.get('spark.sql.session.timeZone')
+        try:
+            tz = 'America/Los_Angeles'
+            os.environ['TZ'] = tz
+            time.tzset()
+            self.spark.conf.set('spark.sql.session.timeZone', tz)
+
+            df = self.spark.createDataFrame(pdf)
+            self.assertPandasEqual(pdf, df.toPandas())
+        finally:
+            del os.environ['TZ']
+            if orig_env_tz is not None:
+                os.environ['TZ'] = orig_env_tz
+            time.tzset()
+            self.spark.conf.set('spark.sql.session.timeZone', orig_session_tz)
+
 
 class HiveSparkSubmitTests(SparkSubmitTests):
 

http://git-wip-us.apache.org/repos/asf/spark/blob/97a224a8/python/pyspark/sql/types.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 093dae5..2599dc5 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -1709,6 +1709,21 @@ def _check_dataframe_convert_date(pdf, schema):
     return pdf
 
 
+def _get_local_timezone():
+    """ Get local timezone using pytz with environment variable, or dateutil.
+
+    If there is a 'TZ' environment variable, pass it to pandas to use pytz and 
use it as timezone
+    string, otherwise use the special word 'dateutil/:' which means that 
pandas uses dateutil and
+    it reads system configuration to know the system local timezone.
+
+    See also:
+    - https://github.com/pandas-dev/pandas/blob/0.19.x/pandas/tslib.pyx#L1753
+    - https://github.com/dateutil/dateutil/blob/2.6.1/dateutil/tz/tz.py#L1338
+    """
+    import os
+    return os.environ.get('TZ', 'dateutil/:')
+
+
 def _check_dataframe_localize_timestamps(pdf, timezone):
     """
     Convert timezone aware timestamps to timezone-naive in the specified 
timezone or local timezone
@@ -1721,7 +1736,7 @@ def _check_dataframe_localize_timestamps(pdf, timezone):
     require_minimum_pandas_version()
 
     from pandas.api.types import is_datetime64tz_dtype
-    tz = timezone or 'tzlocal()'
+    tz = timezone or _get_local_timezone()
     for column, series in pdf.iteritems():
         # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
         if is_datetime64tz_dtype(series.dtype):
@@ -1744,7 +1759,7 @@ def _check_series_convert_timestamps_internal(s, 
timezone):
     from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
     # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
     if is_datetime64_dtype(s.dtype):
-        tz = timezone or 'tzlocal()'
+        tz = timezone or _get_local_timezone()
         return s.dt.tz_localize(tz).dt.tz_convert('UTC')
     elif is_datetime64tz_dtype(s.dtype):
         return s.dt.tz_convert('UTC')
@@ -1766,8 +1781,8 @@ def _check_series_convert_timestamps_localize(s, 
from_timezone, to_timezone):
 
     import pandas as pd
     from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype
-    from_tz = from_timezone or 'tzlocal()'
-    to_tz = to_timezone or 'tzlocal()'
+    from_tz = from_timezone or _get_local_timezone()
+    to_tz = to_timezone or _get_local_timezone()
     # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
     if is_datetime64tz_dtype(s.dtype):
         return s.dt.tz_convert(to_tz).dt.tz_localize(None)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-23360][SQL][PYTHON] Get local timezone from environment via pytz, or dateutil.

Reply via email to