This is an automated email from the ASF dual-hosted git repository. abukor pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git
commit 8c069b3f9d9ac6cb664e0f30b879461d6b40efee Author: Attila Bukor <[email protected]> AuthorDate: Wed Dec 11 17:06:57 2019 -0800 [python] KUDU-1938 Add VARCHAR support Change-Id: I27b2a85c9caa3f026f4fb0ab8974899babb01dac Reviewed-on: http://gerrit.cloudera.org:8080/14879 Reviewed-by: Alexey Serbin <[email protected]> Tested-by: Kudu Jenkins --- python/kudu/__init__.py | 2 +- python/kudu/client.pyx | 18 +++++++++++++- python/kudu/libkudu_client.pxd | 13 ++++++++++ python/kudu/schema.pyx | 44 +++++++++++++++++++++++++++++----- python/kudu/tests/test_scanner.py | 9 +++++-- python/kudu/tests/test_scantoken.py | 4 ++++ python/kudu/tests/test_schema.py | 48 +++++++++++++++++++++++++++++++++++++ python/kudu/tests/util.py | 17 +++++++++---- 8 files changed, 141 insertions(+), 14 deletions(-) diff --git a/python/kudu/__init__.py b/python/kudu/__init__.py index 1146f05..989eea1 100644 --- a/python/kudu/__init__.py +++ b/python/kudu/__init__.py @@ -39,7 +39,7 @@ from kudu.errors import (KuduException, KuduBadStatus, KuduNotFound, # noqa from kudu.schema import (int8, int16, int32, int64, string_ as string, # noqa double_ as double, float_, float_ as float, binary, - unixtime_micros, bool_ as bool, decimal, + unixtime_micros, bool_ as bool, decimal, varchar, KuduType, SchemaBuilder, ColumnSpec, Schema, ColumnSchema, COMPRESSION_DEFAULT, diff --git a/python/kudu/client.pyx b/python/kudu/client.pyx index a37bd93..368d170 100644 --- a/python/kudu/client.pyx +++ b/python/kudu/client.pyx @@ -85,7 +85,8 @@ cdef dict _type_names = { KUDU_DOUBLE : "KUDU_DOUBLE", KUDU_BINARY : "KUDU_BINARY", KUDU_UNIXTIME_MICROS : "KUDU_UNIXTIME_MICROS", - KUDU_DECIMAL : "KUDU_DECIMAL" + KUDU_DECIMAL : "KUDU_DECIMAL", + KUDU_VARCHAR : "KUDU_VARCHAR" } # Range Partition Bound Type enums @@ -1545,6 +1546,12 @@ cdef class Row: scale = self.parent.batch.projection_schema().Column(i).type_attributes().scale() return from_unscaled_decimal(self.__get_unscaled_decimal(i), scale) + cdef inline get_varchar(self, int i): + cdef Slice val + check_status(self.row.GetVarchar(i, &val)) + return cpython.PyBytes_FromStringAndSize(<char*> val.mutable_data(), + val.size()) + cdef inline get_slot(self, int i): cdef: Status s @@ -1572,6 +1579,8 @@ cdef class Row: return from_unixtime_micros(self.get_unixtime_micros(i)) elif t == KUDU_DECIMAL: return self.get_decimal(i) + elif t == KUDU_VARCHAR: + return frombytes(self.get_varchar(i)) else: raise TypeError("Cannot get kudu type <{0}>" .format(_type_names[t])) @@ -2770,6 +2779,11 @@ cdef class PartialRow: slc = Slice(<char*> value, len(value)) check_status(self.row.SetBinaryCopy(i, slc)) + elif t == KUDU_VARCHAR: + if isinstance(value, unicode): + value = value.encode('utf8') + slc = Slice(<char*> value, len(value)) + check_status(self.row.SetVarchar(i, slc)) elif t == KUDU_UNIXTIME_MICROS: check_status(self.row.SetUnixTimeMicros(i, <int64_t> to_unixtime_micros(value))) @@ -2884,6 +2898,8 @@ cdef inline cast_pyvalue(DataType t, object o): return UnixtimeMicrosVal(o) elif t == KUDU_BINARY: return StringVal(o) + elif t == KUDU_VARCHAR: + return StringVal(o) else: raise TypeError("Cannot cast kudu type <{0}>".format(_type_names[t])) diff --git a/python/kudu/libkudu_client.pxd b/python/kudu/libkudu_client.pxd index a34d6c2..2c65a26 100644 --- a/python/kudu/libkudu_client.pxd +++ b/python/kudu/libkudu_client.pxd @@ -125,6 +125,7 @@ cdef extern from "kudu/client/schema.h" namespace "kudu::client" nogil: KUDU_BINARY " kudu::client::KuduColumnSchema::BINARY" KUDU_UNIXTIME_MICROS " kudu::client::KuduColumnSchema::UNIXTIME_MICROS" KUDU_DECIMAL " kudu::client::KuduColumnSchema::DECIMAL" + KUDU_VARCHAR " kudu::client::KuduColumnSchema::VARCHAR" enum EncodingType" kudu::client::KuduColumnStorageAttributes::EncodingType": EncodingType_AUTO " kudu::client::KuduColumnStorageAttributes::AUTO_ENCODING" @@ -152,9 +153,11 @@ cdef extern from "kudu/client/schema.h" namespace "kudu::client" nogil: KuduColumnTypeAttributes() KuduColumnTypeAttributes(const KuduColumnTypeAttributes& other) KuduColumnTypeAttributes(int8_t precision, int8_t scale) + KuduColumnTypeAttributes(uint16_t length) int8_t precision() int8_t scale() + uint16_t length() c_bool Equals(KuduColumnTypeAttributes& other) void CopyFrom(KuduColumnTypeAttributes& other) @@ -203,6 +206,7 @@ cdef extern from "kudu/client/schema.h" namespace "kudu::client" nogil: KuduColumnSpec* Precision(int8_t precision); KuduColumnSpec* Scale(int8_t scale); + KuduColumnSpec* Length(uint16_t length); KuduColumnSpec* RenameTo(const string& new_name) @@ -264,6 +268,9 @@ cdef extern from "kudu/client/scan_batch.h" namespace "kudu::client" nogil: Status GetBinary(const Slice& col_name, Slice* val) Status GetBinary(int col_idx, Slice* val) + Status GetVarchar(const Slice& col_name, Slice* val) + Status GetVarchar(int col_idx, Slice* val) + const void* cell(int col_idx) string ToString() @@ -359,6 +366,9 @@ cdef extern from "kudu/common/partial_row.h" namespace "kudu" nogil: Status SetBinaryCopy(const Slice& col_name, const Slice& val) Status SetBinaryCopy(int col_idx, const Slice& val) + Status SetVarchar(Slice& col_name, Slice& val) + Status SetVarchar(int col_idx, Slice& val) + Status SetNull(Slice& col_name) Status SetNull(int col_idx) @@ -410,6 +420,9 @@ cdef extern from "kudu/common/partial_row.h" namespace "kudu" nogil: Status GetBinary(const Slice& col_name, Slice* val) Status GetBinary(int col_idx, Slice* val) + Status GetVarchar(Slice& col_name, Slice* val) + Status GetVarchar(int col_idx, Slice* val) + Status EncodeRowKey(string* encoded_key) string ToEncodedRowKeyOrDie() diff --git a/python/kudu/schema.pyx b/python/kudu/schema.pyx index 8a3f9ee..35f7192 100644 --- a/python/kudu/schema.pyx +++ b/python/kudu/schema.pyx @@ -35,6 +35,7 @@ from . import util BOOL = KUDU_BOOL STRING = KUDU_STRING +VARCHAR = KUDU_VARCHAR INT8 = KUDU_INT8 INT16 = KUDU_INT16 @@ -123,6 +124,7 @@ double_ = KuduType(KUDU_DOUBLE) binary = KuduType(KUDU_BINARY) unixtime_micros = KuduType(KUDU_UNIXTIME_MICROS) decimal = KuduType(KUDU_DECIMAL) +varchar = KuduType(KUDU_VARCHAR) cdef dict _type_names = { @@ -136,7 +138,8 @@ cdef dict _type_names = { DOUBLE: 'double', BINARY: 'binary', UNIXTIME_MICROS: 'unixtime_micros', - DECIMAL: 'decimal' + DECIMAL: 'decimal', + VARCHAR: 'varchar' } @@ -153,7 +156,8 @@ cdef dict _type_to_obj = { DOUBLE: double_, BINARY: binary, UNIXTIME_MICROS: unixtime_micros, - DECIMAL: decimal + DECIMAL: decimal, + VARCHAR: varchar } @@ -171,9 +175,11 @@ cdef cppclass KuduColumnTypeAttributes: KuduColumnTypeAttributes() KuduColumnTypeAttributes(const KuduColumnTypeAttributes& other) KuduColumnTypeAttributes(int8_t precision, int8_t scale) + KuduColumnTypeAttributes(uint16_t length) int8_t precision() int8_t scale() + uint16_t length() c_bool Equals(KuduColumnTypeAttributes& other) void CopyFrom(KuduColumnTypeAttributes& other) @@ -197,10 +203,15 @@ cdef class ColumnTypeAttributes: def __get__(self): return self.type_attributes.scale() + property length: + def __get__(self): + return self.type_attributes.length() + def __repr__(self): - return ('ColumnTypeAttributes(precision=%s, scale=%s)' + return ('ColumnTypeAttributes(precision=%s, scale=%s, length=%s)' % (self.type_attributes.precision(), - self.type_attributes.scale())) + self.type_attributes.scale(), + self.type_attributes.length())) cdef class ColumnSchema: """ @@ -382,6 +393,22 @@ cdef class ColumnSpec: self.spec.Scale(scale) return self + def length(self, length): + """ + Set the length for the column. + + Clients can specify a length for varchar columns. Length is the maximum + length in characters (UTF-8) of the string that the varchar can hold. + + The length must be between 1 and 65,535 (inclusive). + + Returns + ------- + self + """ + self.spec.Length(length) + return self + def primary_key(self): """ Make this column a primary key. If you use this method, it will be the @@ -473,7 +500,7 @@ cdef class SchemaBuilder: def add_column(self, name, type_=None, nullable=None, compression=None, encoding=None, primary_key=False, block_size=None, - default=None, precision=None, scale=None): + default=None, precision=None, scale=None, length=None): """ Add a new column to the schema. Returns a ColumnSpec object for further configuration and use in a fluid programming style. @@ -502,6 +529,8 @@ cdef class SchemaBuilder: Use this precision for the decimal column scale : int Use this scale for the decimal column + length : int + Use this length for the varchar column Examples -------- @@ -537,6 +566,9 @@ cdef class SchemaBuilder: if scale is not None: result.scale(scale) + if length is not None: + result.length(length) + if primary_key: result.primary_key() @@ -749,7 +781,7 @@ cdef class KuduValue: if (type_.name[:3] == 'int'): self._value = C_KuduValue.FromInt(value) - elif (type_.name in ['string', 'binary']): + elif (type_.name in ['string', 'binary', 'varchar']): if isinstance(value, unicode): value = value.encode('utf8') diff --git a/python/kudu/tests/test_scanner.py b/python/kudu/tests/test_scanner.py index 4bedf96..b7e21b6 100644 --- a/python/kudu/tests/test_scanner.py +++ b/python/kudu/tests/test_scanner.py @@ -312,6 +312,9 @@ class TestScanner(TestScanBase): # Test a binary predicate self._test_binary_pred() + def test_varchar_pred(self): + self._test_varchar_pred() + def test_scan_selection(self): """ This test confirms that setting the scan selection policy on the @@ -348,7 +351,8 @@ class TestScanner(TestScanBase): self.assertEqual(types[5], np.float64) self.assertEqual(types[6], np.int8) self.assertEqual(types[7], np.object) - self.assertEqual(types[8], np.float32) + self.assertEqual(types[8], np.object) + self.assertEqual(types[9], np.float32) else: self.assertEqual(types[0], np.int64) self.assertEqual(types[1], 'datetime64[ns, UTC]') @@ -357,7 +361,8 @@ class TestScanner(TestScanBase): self.assertEqual(types[4], np.float64) self.assertEqual(types[5], np.int8) self.assertEqual(types[6], np.object) - self.assertEqual(types[7], np.float32) + self.assertEqual(types[7], np.object) + self.assertEqual(types[8], np.float32) @pytest.mark.skipif(not (kudu.CLIENT_SUPPORTS_PANDAS), reason="Pandas required to run this test.") diff --git a/python/kudu/tests/test_scantoken.py b/python/kudu/tests/test_scantoken.py index 37b273d..cfea352 100644 --- a/python/kudu/tests/test_scantoken.py +++ b/python/kudu/tests/test_scantoken.py @@ -268,6 +268,10 @@ class TestScanToken(TestScanBase): # Test a binary predicate self._test_binary_pred() + def test_varchar_pred(self): + # Test a varchar predicate + self._test_varchar_pred() + def test_scan_selection(self): """ This test confirms that setting the scan selection policy on the diff --git a/python/kudu/tests/test_schema.py b/python/kudu/tests/test_schema.py index 4870ab7..1b81bbb 100644 --- a/python/kudu/tests/test_schema.py +++ b/python/kudu/tests/test_schema.py @@ -182,6 +182,54 @@ class TestSchema(unittest.TestCase): with self.assertRaises(kudu.KuduInvalidArgument): builder.build() + def test_varchar(self): + builder = kudu.schema_builder() + (builder.add_column('key') + .type('varchar') + .primary_key() + .nullable(False) + .length(10)) + schema = builder.build() + + column = schema[0] + tp = column.type + assert tp.name == 'varchar' + assert tp.type == kudu.schema.VARCHAR + ta = column.type_attributes + assert ta.length == 10 + + def test_varchar_without_length(self): + builder = kudu.schema_builder() + (builder.add_column('key') + .type('varchar') + .primary_key() + .nullable(False)) + + with self.assertRaises(kudu.KuduInvalidArgument): + builder.build() + + def test_varchar_invalid_length(self): + builder = kudu.schema_builder() + (builder.add_column('key') + .type('varchar') + .primary_key() + .length(0) + .nullable(False)) + + with self.assertRaises(kudu.KuduInvalidArgument): + builder.build() + + def test_length_on_non_varchar_column(self): + builder = kudu.schema_builder() + (builder.add_column('key') + .type('decimal') + .primary_key() + .nullable(False) + .length(10)) + + with self.assertRaises(kudu.KuduInvalidArgument): + builder.build() + def test_unsupported_col_spec_methods_for_create_table(self): builder = kudu.schema_builder() builder.add_column('test', 'int64').rename('test') diff --git a/python/kudu/tests/util.py b/python/kudu/tests/util.py index 6823b3a..62f94d2 100644 --- a/python/kudu/tests/util.py +++ b/python/kudu/tests/util.py @@ -72,6 +72,7 @@ class TestScanBase(KuduTestBase, unittest.TestCase): builder.add_column('double_val', type_=kudu.double) builder.add_column('int8_val', type_=kudu.int8) builder.add_column('binary_val', type_='binary', compression=kudu.COMPRESSION_SNAPPY, encoding='prefix') + builder.add_column('varchar_val', type_=kudu.varchar, length=10) builder.add_column('float_val', type_=kudu.float) builder.set_primary_keys(['key', 'unixtime_micros_val']) schema = builder.build() @@ -103,22 +104,22 @@ class TestScanBase(KuduTestBase, unittest.TestCase): (1, datetime.datetime(2016, 1, 1).replace(tzinfo=pytz.utc), Decimal('111.11'), "Test One", True, 1.7976931348623157 * (10^308), 127, b'\xce\x99\xce\xbf\xcf\x81\xce\xb4\xce\xb1\xce\xbd\xce\xaf\xce\xb1', - 3.402823 * (10^38)), + "Test One", 3.402823 * (10^38)), (2, datetime.datetime.utcnow().replace(tzinfo=pytz.utc), Decimal('0.99'), "测试二", False, 200.1, -1, b'\xd0\x98\xd0\xbe\xd1\x80\xd0\xb4\xd0\xb0\xd0\xbd\xd0\xb8\xd1\x8f', - -150.2) + "测试二", -150.2) ] else: self.type_test_rows = [ (1, datetime.datetime(2016, 1, 1).replace(tzinfo=pytz.utc), "Test One", True, 1.7976931348623157 * (10 ^ 308), 127, b'\xce\x99\xce\xbf\xcf\x81\xce\xb4\xce\xb1\xce\xbd\xce\xaf\xce\xb1', - 3.402823 * (10 ^ 38)), + "Test One", 3.402823 * (10 ^ 38)), (2, datetime.datetime.utcnow().replace(tzinfo=pytz.utc), "测试二", False, 200.1, -1, b'\xd0\x98\xd0\xbe\xd1\x80\xd0\xb4\xd0\xb0\xd0\xbd\xd0\xb8\xd1\x8f', - -150.2) + "测试二", -150.2) ] session = self.client.new_session() for row in self.type_test_rows: @@ -255,3 +256,11 @@ class TestScanBase(KuduTestBase, unittest.TestCase): ], row_indexes=slice(1, 2) ) + + def _test_varchar_pred(self): + self.verify_pred_type_scans( + preds=[ + self.type_table['varchar_val'] == 'Test One' + ], + row_indexes=slice(0, 1) + )
