This is an automated email from the ASF dual-hosted git repository.
ycai pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/cassandra.git
The following commit(s) were added to refs/heads/trunk by this push:
new f32475a Fix incorrect encoding for strings can be UTF8
f32475a is described below
commit f32475a839e01e4eea3989871d293d70e8a360d7
Author: Yifan Cai <[email protected]>
AuthorDate: Mon Feb 22 13:39:16 2021 -0800
Fix incorrect encoding for strings can be UTF8
patch by Yifan Cai; reviewed by Ekaterina Dimitrova for CASSANDRA-16429
---
CHANGES.txt | 1 +
doc/source/cql/definitions.rst | 6 +++--
src/java/org/apache/cassandra/cql3/ResultSet.java | 8 +++---
.../org/apache/cassandra/transport/DataType.java | 8 +++---
.../apache/cassandra/transport/SerDeserTest.java | 30 +++++++++++++++++++---
5 files changed, 40 insertions(+), 13 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 6d311eb..4d3372b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
4.0-beta5
+ * Fix incorrect encoding for strings can be UTF8 (CASSANDRA-16429)
* Fix node unable to join when RF > N in multi-DC with added warning
(CASSANDRA-16296)
* Add an option to nodetool tablestats to check sstable location correctness
(CASSANDRA-16344)
* Unable to ALTER KEYSPACE while decommissioned/assassinated nodes are in
gossip (CASSANDRA-16422)
diff --git a/doc/source/cql/definitions.rst b/doc/source/cql/definitions.rst
index 3df6f20..8ccb630 100644
--- a/doc/source/cql/definitions.rst
+++ b/doc/source/cql/definitions.rst
@@ -46,8 +46,10 @@ To aid in specifying the CQL syntax, we will use the
following conventions in th
Identifiers and keywords
^^^^^^^^^^^^^^^^^^^^^^^^
-The CQL language uses *identifiers* (or *names*) to identify tables, columns
and other objects. An identifier is a token
-matching the regular expression ``[a-zA-Z][a-zA-Z0-9_]*``.
+The CQL language uses *identifiers* (or *names*) to identify tables, columns
and other objects. An identifier can be either
+unquoted or quoted. The unquoted identifier is a token matching the regular
expression ``[a-zA-Z][a-zA-Z0-9_]*``.
+A quoted identifier can contain non-ASCII characters between
the quotation marks, with one exception:
+Cassandra does not accept non-ASCII characters if the quoted identifier is
used for a keyspace name or table name.
A number of such identifiers, like ``SELECT`` or ``WITH``, are *keywords*.
They have a fixed meaning for the language
and most are reserved. The list of those keywords can be found in
:ref:`appendix-A`.
diff --git a/src/java/org/apache/cassandra/cql3/ResultSet.java
b/src/java/org/apache/cassandra/cql3/ResultSet.java
index 4f60445..de393f7 100644
--- a/src/java/org/apache/cassandra/cql3/ResultSet.java
+++ b/src/java/org/apache/cassandra/cql3/ResultSet.java
@@ -452,7 +452,7 @@ public class ResultSet
CBUtil.writeAsciiString(name.ksName, dest);
CBUtil.writeAsciiString(name.cfName, dest);
}
- CBUtil.writeAsciiString(name.name.toString(), dest);
+ CBUtil.writeString(name.name.toString(), dest);
DataType.codec.writeOne(DataType.fromType(name.type,
version), dest, version);
}
}
@@ -488,7 +488,7 @@ public class ResultSet
size += CBUtil.sizeOfAsciiString(name.ksName);
size += CBUtil.sizeOfAsciiString(name.cfName);
}
- size += CBUtil.sizeOfAsciiString(name.name.toString());
+ size += CBUtil.sizeOfString(name.name.toString());
size +=
DataType.codec.oneSerializedSize(DataType.fromType(name.type, version),
version);
}
}
@@ -657,7 +657,7 @@ public class ResultSet
CBUtil.writeAsciiString(name.ksName, dest);
CBUtil.writeAsciiString(name.cfName, dest);
}
- CBUtil.writeAsciiString(name.name.toString(), dest);
+ CBUtil.writeString(name.name.toString(), dest);
DataType.codec.writeOne(DataType.fromType(name.type,
version), dest, version);
}
}
@@ -682,7 +682,7 @@ public class ResultSet
size += CBUtil.sizeOfAsciiString(name.ksName);
size += CBUtil.sizeOfAsciiString(name.cfName);
}
- size += CBUtil.sizeOfAsciiString(name.name.toString());
+ size += CBUtil.sizeOfString(name.name.toString());
size +=
DataType.codec.oneSerializedSize(DataType.fromType(name.type, version),
version);
}
return size;
diff --git a/src/java/org/apache/cassandra/transport/DataType.java
b/src/java/org/apache/cassandra/transport/DataType.java
index 9d45e8a..1d1a913 100644
--- a/src/java/org/apache/cassandra/transport/DataType.java
+++ b/src/java/org/apache/cassandra/transport/DataType.java
@@ -161,11 +161,11 @@ public enum DataType
case UDT:
UserType udt = (UserType)value;
CBUtil.writeAsciiString(udt.keyspace, cb);
- CBUtil.writeAsciiString(UTF8Type.instance.compose(udt.name),
cb);
+ CBUtil.writeString(UTF8Type.instance.compose(udt.name), cb);
cb.writeShort(udt.size());
for (int i = 0; i < udt.size(); i++)
{
- CBUtil.writeAsciiString(udt.fieldName(i).toString(), cb);
+ CBUtil.writeString(udt.fieldName(i).toString(), cb);
codec.writeOne(DataType.fromType(udt.fieldType(i),
version), cb, version);
}
break;
@@ -201,11 +201,11 @@ public enum DataType
UserType udt = (UserType)value;
int size = 0;
size += CBUtil.sizeOfAsciiString(udt.keyspace);
- size +=
CBUtil.sizeOfAsciiString(UTF8Type.instance.compose(udt.name));
+ size +=
CBUtil.sizeOfString(UTF8Type.instance.compose(udt.name));
size += 2;
for (int i = 0; i < udt.size(); i++)
{
- size +=
CBUtil.sizeOfAsciiString(udt.fieldName(i).toString());
+ size += CBUtil.sizeOfString(udt.fieldName(i).toString());
size +=
codec.oneSerializedSize(DataType.fromType(udt.fieldType(i), version), version);
}
return size;
diff --git a/test/unit/org/apache/cassandra/transport/SerDeserTest.java
b/test/unit/org/apache/cassandra/transport/SerDeserTest.java
index 42ffa26..da76070 100644
--- a/test/unit/org/apache/cassandra/transport/SerDeserTest.java
+++ b/test/unit/org/apache/cassandra/transport/SerDeserTest.java
@@ -20,6 +20,8 @@ package org.apache.cassandra.transport;
import java.nio.ByteBuffer;
import java.util.*;
+import org.apache.commons.lang3.RandomStringUtils;
+
import io.netty.buffer.Unpooled;
import io.netty.buffer.ByteBuf;
@@ -203,20 +205,32 @@ public class SerDeserTest
SetType<?> st = SetType.getInstance(UTF8Type.instance, true);
MapType<?, ?> mt = MapType.getInstance(UTF8Type.instance,
LongType.instance, true);
+ String typeName = "myType" + randomUTF8(3);
+ String f1 = 'f' + randomUTF8(3);
+
UserType udt = new UserType("ks",
- bb("myType"),
- Arrays.asList(field("f1"), field("f2"),
field("f3"), field("f4")),
+ bb(typeName),
+ Arrays.asList(field(f1), field("f2"),
field("f3"), field("f4")),
Arrays.asList(LongType.instance, lt, st,
mt),
true);
Map<FieldIdentifier, Term.Raw> value = new HashMap<>();
- value.put(field("f1"), lit(42));
+ value.put(field(f1), lit(42));
value.put(field("f2"), new
Lists.Literal(Arrays.<Term.Raw>asList(lit(3), lit(1))));
value.put(field("f3"), new
Sets.Literal(Arrays.<Term.Raw>asList(lit("foo"), lit("bar"))));
value.put(field("f4"), new Maps.Literal(Arrays.<Pair<Term.Raw,
Term.Raw>>asList(
Pair.<Term.Raw, Term.Raw>create(lit("foo"),
lit(24)),
Pair.<Term.Raw, Term.Raw>create(lit("bar"),
lit(12)))));
+ ByteBuf buf = Unpooled.buffer(DataType.UDT.serializedValueSize(udt,
version));
+ DataType.UDT.writeValue(udt, buf, version);
+ UserType decoded = (UserType) DataType.UDT.readValue(buf, version);
+ assertNotNull(decoded);
+ assertEquals("User type name mismatches: " + typeName,
+ udt.name, decoded.name);
+ assertEquals("Decoded field name mismatches: " + f1,
+ udt.fieldNameAsString(0), decoded.fieldNameAsString(0));
+
UserTypes.Literal u = new UserTypes.Literal(value);
Term t = u.prepare("ks", columnSpec("myValue", udt));
@@ -258,6 +272,9 @@ public class SerDeserTest
List<ColumnSpecification> columnNames = new ArrayList<>();
for (int i = 0; i < 3; i++)
columnNames.add(new ColumnSpecification("ks", "cf", new
ColumnIdentifier("col" + i, false), Int32Type.instance));
+ // add a column name that contains non-ASCII UTF-8 characters (valid for
Cassandra)
+ String utf8ColName = "col" + randomUTF8(3);
+ columnNames.add(new ColumnSpecification("ks", "cf", new
ColumnIdentifier(utf8ColName, false), Int32Type.instance));
if (version == ProtocolVersion.V3)
{
@@ -381,4 +398,11 @@ public class SerDeserTest
assertEquals(options.getTimestamp(state),
decodedOptions.getTimestamp(state));
assertEquals(options.getNowInSeconds(state),
decodedOptions.getNowInSeconds(state));
}
+
+ // Returns a random string of `count` chars, none of them ASCII (lower bound 129 is above the 0-127 ASCII range).
+ public static String randomUTF8(int count)
+ {
+ // Upper bound 0xD800 is where UTF-16 surrogates begin; presumably excluded so each char is valid on its own for Cassandra - TODO confirm.
+ return RandomStringUtils.random(count, 129, 0xD800, false, false);
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]