This is an automated email from the ASF dual-hosted git repository.

abukor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit caa8d6d2fc9988dbd8c6331b2ab14f1a5cf63e56
Author: Attila Bukor <[email protected]>
AuthorDate: Tue Sep 24 14:00:55 2019 +0200

    KUDU-1938 Add support for VARCHAR pt 1
    
    Introduces the VARCHAR data type to the server. Follow-up commits will
    add integration to the clients. The VARCHAR type is parameterized with a
    length column type attribute, similar to DECIMAL's scale and precision.
    Internally it's stored as BINARY.
    
    The maximum length for VARCHAR is 65,535 characters. If a value longer
    than "n" characters is submitted for a VARCHAR(n) column, the value is
    truncated to "n" characters before the data is persisted on the server
    side.
    
    The maximum length was chosen for compatibility reasons. Apache Impala
    has a maximum length of 65,535 *bytes* for VARCHAR, and the major RDBMSs
    I checked also have similar limits, either in characters or bytes, mostly
    configurable.
    
    Change-Id: I998982dba93831db91c43a97ce30d3e68c2a4a54
    Reviewed-on: http://gerrit.cloudera.org:8080/13760
    Reviewed-by: Alexey Serbin <[email protected]>
    Tested-by: Kudu Jenkins
    Reviewed-by: Grant Henke <[email protected]>
    Reviewed-by: Adar Dembo <[email protected]>
---
 src/kudu/common/column_predicate-test.cc | 10 +++++
 src/kudu/common/common.proto             |  3 ++
 src/kudu/common/partial_row-test.cc      | 25 +++++++++++-
 src/kudu/common/partial_row.cc           | 70 ++++++++++++++++++++++++--------
 src/kudu/common/partial_row.h            | 45 +++++++++++++++++---
 src/kudu/common/schema.cc                |  4 ++
 src/kudu/common/schema.h                 | 18 +++++++-
 src/kudu/common/types.cc                 |  1 +
 src/kudu/common/types.h                  | 19 ++++++++-
 src/kudu/common/wire_protocol.cc         |  5 +++
 src/kudu/util/CMakeLists.txt             |  1 +
 src/kudu/util/char_util.cc               | 44 ++++++++++++++++++++
 src/kudu/util/char_util.h                | 39 ++++++++++++++++++
 13 files changed, 256 insertions(+), 28 deletions(-)

diff --git a/src/kudu/common/column_predicate-test.cc 
b/src/kudu/common/column_predicate-test.cc
index 335bf78..15c0ed3 100644
--- a/src/kudu/common/column_predicate-test.cc
+++ b/src/kudu/common/column_predicate-test.cc
@@ -1121,6 +1121,7 @@ TEST_F(TestColumnPredicate, TestLess) {
     ColumnSchema d128("d128", DECIMAL128);
     ColumnSchema string("string", STRING);
     ColumnSchema binary("binary", BINARY);
+    ColumnSchema varchar("varchar", VARCHAR);
 
     ASSERT_EQ(PredicateType::None,
               ColumnPredicate::Range(i8, nullptr, 
TypeTraits<INT8>::min_value())
@@ -1158,6 +1159,9 @@ TEST_F(TestColumnPredicate, TestLess) {
     ASSERT_EQ(PredicateType::None,
               ColumnPredicate::Range(binary, nullptr, 
TypeTraits<BINARY>::min_value())
                               .predicate_type());
+    ASSERT_EQ(PredicateType::None,
+              ColumnPredicate::Range(varchar, nullptr, 
TypeTraits<VARCHAR>::min_value())
+                              .predicate_type());
 }
 
 TEST_F(TestColumnPredicate, TestGreaterThanEquals) {
@@ -1173,6 +1177,7 @@ TEST_F(TestColumnPredicate, TestGreaterThanEquals) {
     ColumnSchema d128("d128", DECIMAL128);
     ColumnSchema string("string", STRING);
     ColumnSchema binary("binary", BINARY);
+    ColumnSchema varchar("varchar", VARCHAR);
 
     ASSERT_EQ(PredicateType::IsNotNull,
               ColumnPredicate::Range(i8, TypeTraits<INT8>::min_value(), 
nullptr)
@@ -1210,6 +1215,9 @@ TEST_F(TestColumnPredicate, TestGreaterThanEquals) {
     ASSERT_EQ(PredicateType::IsNotNull,
               ColumnPredicate::Range(binary, TypeTraits<BINARY>::min_value(), 
nullptr)
                               .predicate_type());
+    ASSERT_EQ(PredicateType::IsNotNull,
+              ColumnPredicate::Range(varchar, 
TypeTraits<VARCHAR>::min_value(), nullptr)
+                              .predicate_type());
 
     ASSERT_EQ(PredicateType::Equality,
               ColumnPredicate::Range(i8, TypeTraits<INT8>::max_value(), 
nullptr)
@@ -1247,6 +1255,8 @@ TEST_F(TestColumnPredicate, TestGreaterThanEquals) {
               ColumnPredicate::Range(string, &s, nullptr).predicate_type());
     ASSERT_EQ(PredicateType::Range,
               ColumnPredicate::Range(binary, &s, nullptr).predicate_type());
+    ASSERT_EQ(PredicateType::Range,
+              ColumnPredicate::Range(varchar, &s, nullptr).predicate_type());
 }
 
 // Test the InList constructor.
diff --git a/src/kudu/common/common.proto b/src/kudu/common/common.proto
index 1982315..8fb6a4a 100644
--- a/src/kudu/common/common.proto
+++ b/src/kudu/common/common.proto
@@ -56,6 +56,7 @@ enum DataType {
   DECIMAL64 = 16;
   DECIMAL128 = 17;
   IS_DELETED = 18; // virtual column; not a real data type
+  VARCHAR = 19;
 }
 
 enum EncodingType {
@@ -94,6 +95,8 @@ message ColumnTypeAttributesPB {
   // For decimal columns
   optional int32 precision = 1;
   optional int32 scale = 2;
+  // For varchar columns
+  optional int32 length = 3;
 }
 
 // TODO: Differentiate between the schema attributes
diff --git a/src/kudu/common/partial_row-test.cc 
b/src/kudu/common/partial_row-test.cc
index 383a090..ecf600e 100644
--- a/src/kudu/common/partial_row-test.cc
+++ b/src/kudu/common/partial_row-test.cc
@@ -43,7 +43,9 @@ class PartialRowTest : public KuduTest {
                 ColumnSchema("string_val", STRING, true),
                 ColumnSchema("binary_val", BINARY, true),
                 ColumnSchema("decimal_val", DECIMAL32, true, nullptr, nullptr,
-                             ColumnStorageAttributes(), 
ColumnTypeAttributes(6, 2)) },
+                             ColumnStorageAttributes(), 
ColumnTypeAttributes(6, 2)),
+                ColumnSchema("varchar_val", VARCHAR, true, nullptr, nullptr,
+                             ColumnStorageAttributes(), 
ColumnTypeAttributes(10)) },
               1) {
     SeedRandom();
   }
@@ -129,6 +131,9 @@ TEST_F(PartialRowTest, UnitTest) {
   EXPECT_FALSE(row.IsColumnSet(0));
   EXPECT_FALSE(row.IsColumnSet(1));
   EXPECT_FALSE(row.IsColumnSet(2));
+  EXPECT_FALSE(row.IsColumnSet(3));
+  EXPECT_FALSE(row.IsColumnSet(4));
+  EXPECT_FALSE(row.IsColumnSet(5));
   EXPECT_FALSE(row.IsKeySet());
   EXPECT_EQ("", row.ToString());
 
@@ -255,6 +260,24 @@ TEST_F(PartialRowTest, UnitTest) {
   // able to set string columns with SetBinary and vice versa.
   EXPECT_FALSE(row.SetBinaryCopy("string_val", "oops").ok());
   EXPECT_FALSE(row.SetStringCopy("binary_val", "oops").ok());
+
+  EXPECT_OK(row.Unset(4));
+
+  s = row.SetVarchar("varchar_val", "shortval");
+  EXPECT_TRUE(row.IsColumnSet(5));
+  EXPECT_EQ("varchar varchar_val=\"shortval\"", row.ToString());
+
+  s = row.SetVarchar("varchar_val", "shortval  value ");
+  EXPECT_EQ("varchar varchar_val=\"shortval  \"", row.ToString());
+
+  s = row.SetVarchar("varchar_val", "this value is too long");
+  EXPECT_EQ("varchar varchar_val=\"this value\"", row.ToString());
+
+  s = row.SetVarchar("varchar_val", "Árvíztűrő tükörfúrógép");
+  EXPECT_EQ("varchar varchar_val=\"Árvíztűrő \"", row.ToString());
+
+  s = row.SetVarchar("varchar_val", "123456789\xF0\x9F\xA6\x8C ABCDEF");
+  EXPECT_EQ("varchar varchar_val=\"123456789\xF0\x9F\xA6\x8C\"", 
row.ToString());
 }
 
 TEST_F(PartialRowTest, TestCopy) {
diff --git a/src/kudu/common/partial_row.cc b/src/kudu/common/partial_row.cc
index 6e924de..f99dd58 100644
--- a/src/kudu/common/partial_row.cc
+++ b/src/kudu/common/partial_row.cc
@@ -18,6 +18,7 @@
 #include "kudu/common/partial_row.h"
 
 #include <cstring>
+#include <ostream>
 #include <string>
 #include <utility>
 
@@ -31,6 +32,7 @@
 #include "kudu/gutil/port.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/util/bitmap.h"
+#include "kudu/util/char_util.h"
 #include "kudu/util/decimal_util.h"
 #include "kudu/util/int128.h"
 #include "kudu/util/logging.h"
@@ -188,6 +190,10 @@ Status KuduPartialRow::Set(int32_t column_idx, const 
uint8_t* val) {
       RETURN_NOT_OK(SetBinaryCopy(column_idx, *reinterpret_cast<const 
Slice*>(val)));
       break;
     }
+    case VARCHAR: {
+      RETURN_NOT_OK(SetVarchar(column_idx, *reinterpret_cast<const 
Slice*>(val)));
+      break;
+    }
     case UNIXTIME_MICROS: {
       RETURN_NOT_OK(SetUnixTimeMicros(column_idx, *reinterpret_cast<const 
int64_t*>(val)));
       break;
@@ -218,11 +224,19 @@ void KuduPartialRow::DeallocateStringIfSet(int col_idx, 
const ColumnSchema& col)
   if (BitmapTest(owned_strings_bitmap_, col_idx)) {
     ContiguousRow row(schema_, row_data_);
     const Slice* dst;
-    if (col.type_info()->type() == BINARY) {
-      dst = schema_->ExtractColumnFromRow<BINARY>(row, col_idx);
-    } else {
-      CHECK(col.type_info()->type() == STRING);
-      dst = schema_->ExtractColumnFromRow<STRING>(row, col_idx);
+    switch (col.type_info()->type()) {
+      case BINARY:
+        dst = schema_->ExtractColumnFromRow<BINARY>(row, col_idx);
+        break;
+      case VARCHAR:
+        dst = schema_->ExtractColumnFromRow<VARCHAR>(row, col_idx);
+        break;
+      case STRING:
+        dst = schema_->ExtractColumnFromRow<STRING>(row, col_idx);
+        break;
+      default:
+        LOG(FATAL) << "Unexpected type " << col.type_info()->type();
+        break;
     }
     delete [] dst->data();
     BitmapClear(owned_strings_bitmap_, col_idx);
@@ -330,12 +344,21 @@ Status KuduPartialRow::SetBinary(const Slice& col_name, 
const Slice& val) {
 Status KuduPartialRow::SetString(const Slice& col_name, const Slice& val) {
   return SetStringCopy(col_name, val);
 }
+// Copying setter for VARCHAR columns addressed by name. SetSliceCopy()
+// resolves the column index and truncates 'val' to the column's length.
+Status KuduPartialRow::SetVarchar(const Slice& col_name, const Slice& val) {
+  return SetSliceCopy<TypeTraits<VARCHAR> >(col_name, val);
+}
+
 Status KuduPartialRow::SetBinary(int col_idx, const Slice& val) {
   return SetBinaryCopy(col_idx, val);
 }
 Status KuduPartialRow::SetString(int col_idx, const Slice& val) {
   return SetStringCopy(col_idx, val);
 }
+Status KuduPartialRow::SetVarchar(int col_idx, const Slice& val) {
+  return SetSliceCopy<TypeTraits<VARCHAR> >(col_idx, val);
+}
 
 Status KuduPartialRow::SetBinaryCopy(const Slice& col_name, const Slice& val) {
   return SetSliceCopy<TypeTraits<BINARY> >(col_name, val);
@@ -365,24 +388,29 @@ Status KuduPartialRow::SetStringNoCopy(int col_idx, const 
Slice& val) {
 
 template<typename T>
 Status KuduPartialRow::SetSliceCopy(const Slice& col_name, const Slice& val) {
-  auto relocated = new uint8_t[val.size()];
-  memcpy(relocated, val.data(), val.size());
-  Slice relocated_val(relocated, val.size());
-  Status s = Set<T>(col_name, relocated_val, true);
-  if (!s.ok()) {
-    delete [] relocated;
-  }
-  return s;
+  int col_idx;
+  RETURN_NOT_OK(schema_->FindColumn(col_name, &col_idx));
+  return SetSliceCopy<T>(col_idx, val);
 }
 
 template<typename T>
 Status KuduPartialRow::SetSliceCopy(int col_idx, const Slice& val) {
-  auto relocated = new uint8_t[val.size()];
-  memcpy(relocated, val.data(), val.size());
-  Slice relocated_val(relocated, val.size());
+  // Bind by reference: copying the whole ColumnSchema on every set is waste.
+  const auto& col = schema_->column(col_idx);
+  // Invariant: relocated_val always points at a new[] buffer, so the
+  // delete [] on the failure path below is valid for every T.
+  Slice relocated_val;
+  if (T::type == VARCHAR) {
+    // UTF8Truncate() copies at most 'length' characters into a new[] buffer.
+    relocated_val = UTF8Truncate(val, col.type_attributes().length);
+  } else {
+    auto relocated = new uint8_t[val.size()];
+    memcpy(relocated, val.data(), val.size());
+    relocated_val = Slice(relocated, val.size());
+  }
   Status s = Set<T>(col_idx, relocated_val, true);
   if (!s.ok()) {
-    delete [] relocated;
+    delete [] relocated_val.data();
   }
   return s;
 }
@@ -656,6 +684,11 @@ Status KuduPartialRow::GetString(const Slice& col_name, 
Slice* val) const {
 Status KuduPartialRow::GetBinary(const Slice& col_name, Slice* val) const {
   return Get<TypeTraits<BINARY> >(col_name, val);
 }
+// Name-based VARCHAR getter. Delegates to the shared Get<T>() helper, the
+// same way GetBinary() above does, instead of resolving the index here.
+Status KuduPartialRow::GetVarchar(const Slice& col_name, Slice* val) const {
+  return Get<TypeTraits<VARCHAR> >(col_name, val);
+}
 
 Status KuduPartialRow::GetBool(int col_idx, bool* val) const {
   return Get<TypeTraits<BOOL> >(col_idx, val);
@@ -716,6 +749,9 @@ Status KuduPartialRow::GetString(int col_idx, Slice* val) 
const {
 Status KuduPartialRow::GetBinary(int col_idx, Slice* val) const {
   return Get<TypeTraits<BINARY> >(col_idx, val);
 }
+Status KuduPartialRow::GetVarchar(int col_idx, Slice* val) const {
+  return Get<TypeTraits<VARCHAR> >(col_idx, val);
+}
 
 template<typename T>
 Status KuduPartialRow::Get(const Slice& col_name,
diff --git a/src/kudu/common/partial_row.h b/src/kudu/common/partial_row.h
index 74375c3..ec33879 100644
--- a/src/kudu/common/partial_row.h
+++ b/src/kudu/common/partial_row.h
@@ -166,10 +166,23 @@ class KUDU_EXPORT KuduPartialRow {
   Status SetString(const Slice& col_name, const Slice& val) WARN_UNUSED_RESULT;
   ///@}
 
+  /// @name Setter for varchar columns by name (copying).
+  ///
+  /// Set the varchar value for a column by name, copying the
+  /// specified data immediately.
+  ///
+  /// @param [in] col_name
+  ///   Name of the target column.
+  /// @param [in] val
+  ///   The value to set; truncated if longer than the column's length.
+  /// @return Operation result status.
+  ///
+  Status SetVarchar(const Slice& col_name, const Slice& val) 
WARN_UNUSED_RESULT;
+
   /// @name Setters for binary/string columns by index (copying).
   ///
-  /// Set the binary/string value for a column by index, copying the specified
-  /// data immediately.
+  /// Set the binary/string value for a column by index, copying
+  /// the specified data immediately.
   ///
   /// These setters are the same as the corresponding column-name-based 
setters,
   /// but with numeric column indexes. These are faster since they avoid
@@ -192,6 +205,24 @@ class KUDU_EXPORT KuduPartialRow {
   Status SetString(int col_idx, const Slice& val) WARN_UNUSED_RESULT;
   ///@}
 
+  /// @name Setter for varchar columns by index (copying).
+  ///
+  /// Set the varchar value for a column by index, copying
+  /// the specified data immediately.
+  ///
+  /// This setter is the same as the corresponding column-name-based setter,
+  /// but takes a numeric column index. It is faster since it avoids a
+  /// hashmap lookup, so it should be preferred in performance-sensitive code
+  /// (e.g. bulk loaders).
+  ///
+  /// @param [in] col_idx
+  ///   The index of the target column.
+  /// @param [in] val
+  ///   The value to set; truncated if longer than the column's length.
+  /// @return Operation result status.
+  ///
+  Status SetVarchar(int col_idx, const Slice& val) WARN_UNUSED_RESULT;
+
   /// @name Setters for binary/string columns by name (copying).
   ///
   /// Set the binary/string value for a column by name, copying the specified
@@ -406,9 +437,9 @@ class KUDU_EXPORT KuduPartialRow {
 #endif
   ///@}
 
-  /// @name Getters for string/binary column by column name.
+  /// @name Getters for string/binary/varchar column by column name.
   ///
-  /// Get the string/binary value for a column by its name.
+  /// Get the string/binary/varchar value for a column by its name.
   ///
   /// @param [in] col_name
   ///   Name of the column.
@@ -425,11 +456,12 @@ class KUDU_EXPORT KuduPartialRow {
   ///@{
   Status GetString(const Slice& col_name, Slice* val) const WARN_UNUSED_RESULT;
   Status GetBinary(const Slice& col_name, Slice* val) const WARN_UNUSED_RESULT;
+  Status GetVarchar(const Slice& col_name, Slice* val) const 
WARN_UNUSED_RESULT;
   ///@}
 
-  /// @name Getters for string/binary column by column index.
+  /// @name Getters for string/binary/varchar column by column index.
   ///
-  /// Get the string/binary value for a column by its index.
+  /// Get the string/binary/varchar value for a column by its index.
   ///
   /// These methods are faster than their name-based counterparts
   /// since they use indices to avoid hashmap lookups, so index-based getters
@@ -450,6 +482,7 @@ class KUDU_EXPORT KuduPartialRow {
   ///@{
   Status GetString(int col_idx, Slice* val) const WARN_UNUSED_RESULT;
   Status GetBinary(int col_idx, Slice* val) const WARN_UNUSED_RESULT;
+  Status GetVarchar(int col_idx, Slice* val) const WARN_UNUSED_RESULT;
   ///@}
 
   //------------------------------------------------------------
diff --git a/src/kudu/common/schema.cc b/src/kudu/common/schema.cc
index 74671bd..11a9966 100644
--- a/src/kudu/common/schema.cc
+++ b/src/kudu/common/schema.cc
@@ -87,6 +87,8 @@ bool ColumnTypeAttributes::EqualsForType(ColumnTypeAttributes 
other,
     case DECIMAL64:
     case DECIMAL128:
       return precision == other.precision && scale == other.scale;
+    case VARCHAR:
+      return length == other.length;
     default:
       return true; // true because unhandled types don't use 
ColumnTypeAttributes.
   }
@@ -98,6 +100,8 @@ string ColumnTypeAttributes::ToStringForType(DataType type) 
const {
     case DECIMAL64:
     case DECIMAL128:
       return Substitute("($0, $1)", precision, scale);
+    case VARCHAR:
+      return Substitute("($0)", length);
     default:
       return "";
   }
diff --git a/src/kudu/common/schema.h b/src/kudu/common/schema.h
index 1916868..489bb00 100644
--- a/src/kudu/common/schema.h
+++ b/src/kudu/common/schema.h
@@ -92,12 +92,20 @@ struct ColumnTypeAttributes {
  public:
   ColumnTypeAttributes()
       : precision(0),
-        scale(0) {
+        scale(0),
+        length(0) {
   }
 
   ColumnTypeAttributes(int8_t precision, int8_t scale)
       : precision(precision),
-        scale(scale) {
+        scale(scale),
+        length(0) {
+  }
+
+  explicit ColumnTypeAttributes(uint16_t length)
+      : precision(0),
+        scale(0),
+        length(length) {
   }
 
   // Does `other` represent equivalent attributes for `type`?
@@ -112,6 +120,12 @@ struct ColumnTypeAttributes {
 
   int8_t precision;
   int8_t scale;
+
+  // Maximum value of the length is 65,535 for compatibility reasons as it's
+  // used by VARCHAR type which can be set to a maximum of 65,535 in case of
+  // MySQL and less for other major RDBMS implementations. The length refers to
+  // the number of characters/symbols (not bytes).
+  uint16_t length;
 };
 
 // Class for storing column attributes such as compression and
diff --git a/src/kudu/common/types.cc b/src/kudu/common/types.cc
index a70c425..7572f69 100644
--- a/src/kudu/common/types.cc
+++ b/src/kudu/common/types.cc
@@ -89,6 +89,7 @@ class TypeInfoResolver {
     AddMapping<DECIMAL64>();
     AddMapping<DECIMAL128>();
     AddMapping<IS_DELETED>();
+    AddMapping<VARCHAR>();
   }
 
   template<DataType type> void AddMapping() {
diff --git a/src/kudu/common/types.h b/src/kudu/common/types.h
index 36f9b6f..e77671e 100644
--- a/src/kudu/common/types.h
+++ b/src/kudu/common/types.h
@@ -20,8 +20,8 @@
 
 
 #include <cmath>
-#include <cstdio>
 #include <cstdint>
+#include <cstdio>
 #include <cstdlib>
 #include <cstring>
 #include <ctime>
@@ -37,7 +37,7 @@
 #include "kudu/gutil/strings/escaping.h"
 #include "kudu/gutil/strings/numbers.h"
 #include "kudu/util/int128.h"
-#include "kudu/util/int128_util.h"
+#include "kudu/util/int128_util.h" // IWYU pragma: keep
 #include "kudu/util/slice.h"
 // IWYU pragma: no_include "kudu/util/status.h"
 
@@ -630,6 +630,19 @@ struct DataTypeTraits<IS_DELETED> : public 
DerivedTypeTraits<BOOL>{
   }
 };
 
+// Traits for VARCHAR: derives from the BINARY traits; adds a name and printer.
+template<>
+struct DataTypeTraits<VARCHAR> : public DerivedTypeTraits<BINARY>{
+  static const char* name() { return "varchar"; }
+  // Appends the Slice at 'val' to 'str' as a quoted, UTF-8-safe C-escaped string.
+  static void AppendDebugStringForValue(const void *val, std::string *str) {
+    const Slice* slice = static_cast<const Slice*>(val);
+    str->push_back('"');
+    str->append(strings::Utf8SafeCEscape(slice->ToString()));
+    str->push_back('"');
+  }
+};
+
 // Instantiate this template to get static access to the type traits.
 template<DataType datatype>
 struct TypeTraits : public DataTypeTraits<datatype> {
@@ -713,6 +726,7 @@ class Variant {
         numeric_.double_val = *static_cast<const double *>(value);
         break;
       case STRING: // Fallthrough intended.
+      case VARCHAR:
       case BINARY:
         {
           const Slice *str = static_cast<const Slice *>(value);
@@ -779,6 +793,7 @@ class Variant {
       case FLOAT:        return (&numeric_.float_val);
       case DOUBLE:       return (&numeric_.double_val);
       case STRING:
+      case VARCHAR:
       case BINARY:       return &vstr_;
       default: LOG(FATAL) << "Unknown data type: " << type_;
     }
diff --git a/src/kudu/common/wire_protocol.cc b/src/kudu/common/wire_protocol.cc
index 82e67cd..d5f3826 100644
--- a/src/kudu/common/wire_protocol.cc
+++ b/src/kudu/common/wire_protocol.cc
@@ -228,6 +228,8 @@ void ColumnSchemaToPB(const ColumnSchema& col_schema, 
ColumnSchemaPB *pb, int fl
       type == DataType::DECIMAL128) {
     
pb->mutable_type_attributes()->set_precision(col_schema.type_attributes().precision);
     
pb->mutable_type_attributes()->set_scale(col_schema.type_attributes().scale);
+  } else if (type == DataType::VARCHAR) {
+    
pb->mutable_type_attributes()->set_length(col_schema.type_attributes().length);
   }
   if (!(flags & SCHEMA_PB_WITHOUT_STORAGE_ATTRIBUTES)) {
     pb->set_encoding(col_schema.attributes().encoding);
@@ -299,6 +301,9 @@ Status ColumnSchemaFromPB(const ColumnSchemaPB& pb, 
boost::optional<ColumnSchema
     if (typeAttributesPB.has_scale()) {
       type_attributes.scale = typeAttributesPB.scale();
     }
+    if (typeAttributesPB.has_length()) {
+      type_attributes.length = typeAttributesPB.length();
+    }
   }
 
   ColumnStorageAttributes attributes;
diff --git a/src/kudu/util/CMakeLists.txt b/src/kudu/util/CMakeLists.txt
index be764c5..cd184ec 100644
--- a/src/kudu/util/CMakeLists.txt
+++ b/src/kudu/util/CMakeLists.txt
@@ -150,6 +150,7 @@ set(UTIL_SRCS
   block_cache_metrics.cc
   bloom_filter.cc
   cache.cc
+  char_util.cc
   coding.cc
   condition_variable.cc
   cow_object.cc
diff --git a/src/kudu/util/char_util.cc b/src/kudu/util/char_util.cc
new file mode 100644
index 0000000..606d421
--- /dev/null
+++ b/src/kudu/util/char_util.cc
@@ -0,0 +1,44 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "kudu/util/char_util.h"
+
+#include <string.h>
+
+namespace kudu {
+
+// Copies at most max_utf8_length UTF-8 characters of 'val' into a buffer
+// allocated with new[]; the returned Slice points at that buffer.
+Slice UTF8Truncate(Slice val, size_t max_utf8_length) {
+  const auto* bytes = val.data();
+  size_t num_utf8_chars = 0;
+  size_t num_bytes = 0;
+  // Index with size_t: an int index is compared against an unsigned size and
+  // overflows for inputs larger than INT_MAX.
+  for (; num_bytes < val.size(); ++num_bytes) {
+    // Every byte that isn't a continuation byte (10xxxxxx) starts a character.
+    if ((bytes[num_bytes] & 0xc0) != 0x80 && ++num_utf8_chars > max_utf8_length) {
+      break;  // This byte begins the first character past the limit.
+    }
+  }
+  // num_bytes <= val.size(), so the prefix copy below stays within 'val'.
+  auto relocated = new uint8_t[num_bytes];
+  memcpy(relocated, val.data(), num_bytes);
+  return Slice(relocated, num_bytes);
+}
+
+} // namespace kudu
diff --git a/src/kudu/util/char_util.h b/src/kudu/util/char_util.h
new file mode 100644
index 0000000..9fa0338
--- /dev/null
+++ b/src/kudu/util/char_util.h
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+
+#include "kudu/util/slice.h"
+
+namespace kudu {
+
+  // Minimum and maximum length for VARCHAR [1,65535]
+  constexpr uint16_t kMinVarcharLength = 1;
+  constexpr uint16_t kMaxVarcharLength = std::numeric_limits<uint16_t>::max();
+
+  // Copy and truncate a slice. The Slice returned owns its memory.
+  //
+  // max_utf8_length is the number of UTF-8 characters/symbols (not bytes) to
+  // truncate to.
+  //
+  // The method doesn't validate the string is well-formed UTF-8.
+  Slice UTF8Truncate(Slice val, size_t max_utf8_length);
+} // namespace kudu

Reply via email to