This is an automated email from the ASF dual-hosted git repository.

ycai pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/cassandra.git


The following commit(s) were added to refs/heads/trunk by this push:
     new f32475a  Fix incorrect encoding for strings can be UTF8
f32475a is described below

commit f32475a839e01e4eea3989871d293d70e8a360d7
Author: Yifan Cai <[email protected]>
AuthorDate: Mon Feb 22 13:39:16 2021 -0800

    Fix incorrect encoding for strings can be UTF8
    
    patch by Yifan Cai; reviewed by Ekaterina Dimitrova for CASSANDRA-16429
---
 CHANGES.txt                                        |  1 +
 doc/source/cql/definitions.rst                     |  6 +++--
 src/java/org/apache/cassandra/cql3/ResultSet.java  |  8 +++---
 .../org/apache/cassandra/transport/DataType.java   |  8 +++---
 .../apache/cassandra/transport/SerDeserTest.java   | 30 +++++++++++++++++++---
 5 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 6d311eb..4d3372b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 4.0-beta5
+ * Fix incorrect encoding for strings can be UTF8 (CASSANDRA-16429)
  * Fix node unable to join when RF > N in multi-DC with added warning 
(CASSANDRA-16296)
  * Add an option to nodetool tablestats to check sstable location correctness 
(CASSANDRA-16344) 
  * Unable to ALTER KEYSPACE while decommissioned/assassinated nodes are in 
gossip (CASSANDRA-16422)
diff --git a/doc/source/cql/definitions.rst b/doc/source/cql/definitions.rst
index 3df6f20..8ccb630 100644
--- a/doc/source/cql/definitions.rst
+++ b/doc/source/cql/definitions.rst
@@ -46,8 +46,10 @@ To aid in specifying the CQL syntax, we will use the 
following conventions in th
 Identifiers and keywords
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
-The CQL language uses *identifiers* (or *names*) to identify tables, columns 
and other objects. An identifier is a token
-matching the regular expression ``[a-zA-Z][a-zA-Z0-9_]*``.
+The CQL language uses *identifiers* (or *names*) to identify tables, columns 
and other objects. An identifier can be either
+unquoted or quoted. The unquoted identifier is a token matching the regular 
expression ``[a-zA-Z][a-zA-Z0-9_]*``.
+In the case of a quoted identifier, it can contain non-ASCII characters between 
the quotation marks, with one exception:
+Cassandra does not accept non-ASCII characters if the quoted identifier is 
used for a keyspace name or table name.
 
 A number of such identifiers, like ``SELECT`` or ``WITH``, are *keywords*. 
They have a fixed meaning for the language
 and most are reserved. The list of those keywords can be found in 
:ref:`appendix-A`.
diff --git a/src/java/org/apache/cassandra/cql3/ResultSet.java 
b/src/java/org/apache/cassandra/cql3/ResultSet.java
index 4f60445..de393f7 100644
--- a/src/java/org/apache/cassandra/cql3/ResultSet.java
+++ b/src/java/org/apache/cassandra/cql3/ResultSet.java
@@ -452,7 +452,7 @@ public class ResultSet
                             CBUtil.writeAsciiString(name.ksName, dest);
                             CBUtil.writeAsciiString(name.cfName, dest);
                         }
-                        CBUtil.writeAsciiString(name.name.toString(), dest);
+                        CBUtil.writeString(name.name.toString(), dest);
                         DataType.codec.writeOne(DataType.fromType(name.type, 
version), dest, version);
                     }
                 }
@@ -488,7 +488,7 @@ public class ResultSet
                             size += CBUtil.sizeOfAsciiString(name.ksName);
                             size += CBUtil.sizeOfAsciiString(name.cfName);
                         }
-                        size += CBUtil.sizeOfAsciiString(name.name.toString());
+                        size += CBUtil.sizeOfString(name.name.toString());
                         size += 
DataType.codec.oneSerializedSize(DataType.fromType(name.type, version), 
version);
                     }
                 }
@@ -657,7 +657,7 @@ public class ResultSet
                         CBUtil.writeAsciiString(name.ksName, dest);
                         CBUtil.writeAsciiString(name.cfName, dest);
                     }
-                    CBUtil.writeAsciiString(name.name.toString(), dest);
+                    CBUtil.writeString(name.name.toString(), dest);
                     DataType.codec.writeOne(DataType.fromType(name.type, 
version), dest, version);
                 }
             }
@@ -682,7 +682,7 @@ public class ResultSet
                         size += CBUtil.sizeOfAsciiString(name.ksName);
                         size += CBUtil.sizeOfAsciiString(name.cfName);
                     }
-                    size += CBUtil.sizeOfAsciiString(name.name.toString());
+                    size += CBUtil.sizeOfString(name.name.toString());
                     size += 
DataType.codec.oneSerializedSize(DataType.fromType(name.type, version), 
version);
                 }
                 return size;
diff --git a/src/java/org/apache/cassandra/transport/DataType.java 
b/src/java/org/apache/cassandra/transport/DataType.java
index 9d45e8a..1d1a913 100644
--- a/src/java/org/apache/cassandra/transport/DataType.java
+++ b/src/java/org/apache/cassandra/transport/DataType.java
@@ -161,11 +161,11 @@ public enum DataType
             case UDT:
                 UserType udt = (UserType)value;
                 CBUtil.writeAsciiString(udt.keyspace, cb);
-                CBUtil.writeAsciiString(UTF8Type.instance.compose(udt.name), 
cb);
+                CBUtil.writeString(UTF8Type.instance.compose(udt.name), cb);
                 cb.writeShort(udt.size());
                 for (int i = 0; i < udt.size(); i++)
                 {
-                    CBUtil.writeAsciiString(udt.fieldName(i).toString(), cb);
+                    CBUtil.writeString(udt.fieldName(i).toString(), cb);
                     codec.writeOne(DataType.fromType(udt.fieldType(i), 
version), cb, version);
                 }
                 break;
@@ -201,11 +201,11 @@ public enum DataType
                 UserType udt = (UserType)value;
                 int size = 0;
                 size += CBUtil.sizeOfAsciiString(udt.keyspace);
-                size += 
CBUtil.sizeOfAsciiString(UTF8Type.instance.compose(udt.name));
+                size += 
CBUtil.sizeOfString(UTF8Type.instance.compose(udt.name));
                 size += 2;
                 for (int i = 0; i < udt.size(); i++)
                 {
-                    size += 
CBUtil.sizeOfAsciiString(udt.fieldName(i).toString());
+                    size += CBUtil.sizeOfString(udt.fieldName(i).toString());
                     size += 
codec.oneSerializedSize(DataType.fromType(udt.fieldType(i), version), version);
                 }
                 return size;
diff --git a/test/unit/org/apache/cassandra/transport/SerDeserTest.java 
b/test/unit/org/apache/cassandra/transport/SerDeserTest.java
index 42ffa26..da76070 100644
--- a/test/unit/org/apache/cassandra/transport/SerDeserTest.java
+++ b/test/unit/org/apache/cassandra/transport/SerDeserTest.java
@@ -20,6 +20,8 @@ package org.apache.cassandra.transport;
 import java.nio.ByteBuffer;
 import java.util.*;
 
+import org.apache.commons.lang3.RandomStringUtils;
+
 import io.netty.buffer.Unpooled;
 import io.netty.buffer.ByteBuf;
 
@@ -203,20 +205,32 @@ public class SerDeserTest
         SetType<?> st = SetType.getInstance(UTF8Type.instance, true);
         MapType<?, ?> mt = MapType.getInstance(UTF8Type.instance, 
LongType.instance, true);
 
+        String typeName = "myType" + randomUTF8(3);
+        String f1 = 'f' + randomUTF8(3);
+
         UserType udt = new UserType("ks",
-                                    bb("myType"),
-                                    Arrays.asList(field("f1"), field("f2"), 
field("f3"), field("f4")),
+                                    bb(typeName),
+                                    Arrays.asList(field(f1), field("f2"), 
field("f3"), field("f4")),
                                     Arrays.asList(LongType.instance, lt, st, 
mt),
                                     true);
 
         Map<FieldIdentifier, Term.Raw> value = new HashMap<>();
-        value.put(field("f1"), lit(42));
+        value.put(field(f1), lit(42));
         value.put(field("f2"), new 
Lists.Literal(Arrays.<Term.Raw>asList(lit(3), lit(1))));
         value.put(field("f3"), new 
Sets.Literal(Arrays.<Term.Raw>asList(lit("foo"), lit("bar"))));
         value.put(field("f4"), new Maps.Literal(Arrays.<Pair<Term.Raw, 
Term.Raw>>asList(
                                    Pair.<Term.Raw, Term.Raw>create(lit("foo"), 
lit(24)),
                                    Pair.<Term.Raw, Term.Raw>create(lit("bar"), 
lit(12)))));
 
+        ByteBuf buf = Unpooled.buffer(DataType.UDT.serializedValueSize(udt, 
version));
+        DataType.UDT.writeValue(udt, buf, version);
+        UserType decoded = (UserType) DataType.UDT.readValue(buf, version);
+        assertNotNull(decoded);
+        assertEquals("User type name mismatches: " + typeName,
+                     udt.name, decoded.name);
+        assertEquals("Decoded field name mismatches: " + f1,
+                     udt.fieldNameAsString(0), decoded.fieldNameAsString(0));
+
         UserTypes.Literal u = new UserTypes.Literal(value);
         Term t = u.prepare("ks", columnSpec("myValue", udt));
 
@@ -258,6 +272,9 @@ public class SerDeserTest
         List<ColumnSpecification> columnNames = new ArrayList<>();
         for (int i = 0; i < 3; i++)
             columnNames.add(new ColumnSpecification("ks", "cf", new 
ColumnIdentifier("col" + i, false), Int32Type.instance));
+        // add a column name that contains UTF-8 string (that is valid to 
cassandra)
+        String utf8ColName = "col" + randomUTF8(3);
+        columnNames.add(new ColumnSpecification("ks", "cf", new 
ColumnIdentifier(utf8ColName, false), Int32Type.instance));
 
         if (version == ProtocolVersion.V3)
         {
@@ -381,4 +398,11 @@ public class SerDeserTest
         assertEquals(options.getTimestamp(state), 
decodedOptions.getTimestamp(state));
         assertEquals(options.getNowInSeconds(state), 
decodedOptions.getNowInSeconds(state));
     }
+
+    // return utf8 string that contains no ascii chars
+    public static String randomUTF8(int count)
+    {
+        // valid for cassandra
+        return RandomStringUtils.random(count, 129, 0xD800, false, false);
+    }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to