This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.5 by this push: new 09b14f0968c [SPARK-45124][CONNET] Do not use local user ID for Local Relations 09b14f0968c is described below commit 09b14f0968cebe0f2c5c9a369935f27d4ea228f6 Author: Hyukjin Kwon <gurwls...@apache.org> AuthorDate: Tue Sep 12 14:59:44 2023 +0900 [SPARK-45124][CONNET] Do not use local user ID for Local Relations ### What changes were proposed in this pull request? This PR removes the use of `userId` and `sessionId` in `CachedLocalRelation` messages and subsequently make `SparkConnectPlanner` use the `userId`/`sessionId` of the active session rather than the user-provided information. ### Why are the changes needed? Allowing a fetch of a local relation using user-provided information is a potential security risk since this allows users to fetch arbitrary local relations. ### Does this PR introduce _any_ user-facing change? Virtually no. It will ignore the session id or user id that users set (but instead use internal ones that users cannot manipulate). ### How was this patch tested? Manually. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #42880 from HyukjinKwon/no-local-user. Authored-by: Hyukjin Kwon <gurwls...@apache.org> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> (cherry picked from commit 47d801e5e9ded3fb50d274a720ee7874e0b37cc3) Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .../scala/org/apache/spark/sql/SparkSession.scala | 2 - .../main/protobuf/spark/connect/relations.proto | 10 +- .../sql/connect/planner/SparkConnectPlanner.scala | 2 +- python/pyspark/sql/connect/plan.py | 3 - python/pyspark/sql/connect/proto/relations_pb2.py | 160 ++++++++++----------- python/pyspark/sql/connect/proto/relations_pb2.pyi | 15 +- 6 files changed, 87 insertions(+), 105 deletions(-) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala index 7882ea64013..7bd8fa59aea 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -134,8 +134,6 @@ class SparkSession private[sql] ( } else { val hash = client.cacheLocalRelation(arrowData, encoder.schema.json) builder.getCachedLocalRelationBuilder - .setUserId(client.userId) - .setSessionId(client.sessionId) .setHash(hash) } } else { diff --git a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto index 8001b3cbcfa..f7f1315ede0 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto @@ -400,11 +400,11 @@ message LocalRelation { // A local relation that has been cached already. message CachedLocalRelation { - // (Required) An identifier of the user which created the local relation - string userId = 1; - - // (Required) An identifier of the Spark SQL session in which the user created the local relation. - string sessionId = 2; + // `userId` and `sessionId` fields are deleted since the server must always use the active + // session/user rather than arbitrary values provided by the client. It is never valid to access + // a local relation from a different session/user. + reserved 1, 2; + reserved "userId", "sessionId"; // (Required) A sha-256 hash of the serialized local relation in proto, see LocalRelation. string hash = 3; diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala index 2abbacc5a9b..641dfc5dcd3 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -970,7 +970,7 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging { private def transformCachedLocalRelation(rel: proto.CachedLocalRelation): LogicalPlan = { val blockManager = session.sparkContext.env.blockManager - val blockId = CacheId(rel.getUserId, rel.getSessionId, rel.getHash) + val blockId = CacheId(sessionHolder.userId, sessionHolder.sessionId, rel.getHash) val bytes = blockManager.getLocalBytes(blockId) bytes .map { blockData => diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py index 84fd013d014..196b1f119ba 100644 --- a/python/pyspark/sql/connect/plan.py +++ b/python/pyspark/sql/connect/plan.py @@ -398,9 +398,6 @@ class CachedLocalRelation(LogicalPlan): plan = self._create_proto_relation() clr = plan.cached_local_relation - if session._user_id: - clr.userId = session._user_id - clr.sessionId = session._session_id clr.hash = self._hash return plan diff --git a/python/pyspark/sql/connect/proto/relations_pb2.py b/python/pyspark/sql/connect/proto/relations_pb2.py index 3a0a7ff71fd..3f7e5794937 100644 --- a/python/pyspark/sql/connect/proto/relations_pb2.py +++ b/python/pyspark/sql/connect/proto/relations_pb2.py @@ -35,7 +35,7 @@ from pyspark.sql.connect.proto import catalog_pb2 as spark_dot_connect_dot_catal DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1fspark/connect/expressions.proto\x1a\x19spark/connect/types.proto\x1a\x1bspark/connect/catalog.proto"\xe1\x18\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66il [...] + b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1fspark/connect/expressions.proto\x1a\x19spark/connect/types.proto\x1a\x1bspark/connect/catalog.proto"\xe1\x18\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66il [...] ) _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) @@ -111,85 +111,85 @@ if _descriptor._USE_C_DESCRIPTORS == False: _LOCALRELATION._serialized_start = 7090 _LOCALRELATION._serialized_end = 7179 _CACHEDLOCALRELATION._serialized_start = 7181 - _CACHEDLOCALRELATION._serialized_end = 7276 - _CACHEDREMOTERELATION._serialized_start = 7278 - _CACHEDREMOTERELATION._serialized_end = 7333 - _SAMPLE._serialized_start = 7336 - _SAMPLE._serialized_end = 7609 - _RANGE._serialized_start = 7612 - _RANGE._serialized_end = 7757 - _SUBQUERYALIAS._serialized_start = 7759 - _SUBQUERYALIAS._serialized_end = 7873 - _REPARTITION._serialized_start = 7876 - _REPARTITION._serialized_end = 8018 - _SHOWSTRING._serialized_start = 8021 - _SHOWSTRING._serialized_end = 8163 - _HTMLSTRING._serialized_start = 8165 - _HTMLSTRING._serialized_end = 8279 - _STATSUMMARY._serialized_start = 8281 - _STATSUMMARY._serialized_end = 8373 - _STATDESCRIBE._serialized_start = 8375 - _STATDESCRIBE._serialized_end = 8456 - _STATCROSSTAB._serialized_start = 8458 - _STATCROSSTAB._serialized_end = 8559 - _STATCOV._serialized_start = 8561 - _STATCOV._serialized_end = 8657 - _STATCORR._serialized_start = 8660 - _STATCORR._serialized_end = 8797 - _STATAPPROXQUANTILE._serialized_start = 8800 - _STATAPPROXQUANTILE._serialized_end = 8964 - _STATFREQITEMS._serialized_start = 8966 - _STATFREQITEMS._serialized_end = 9091 - _STATSAMPLEBY._serialized_start = 9094 - _STATSAMPLEBY._serialized_end = 9403 - _STATSAMPLEBY_FRACTION._serialized_start = 9295 - _STATSAMPLEBY_FRACTION._serialized_end = 9394 - _NAFILL._serialized_start = 9406 - _NAFILL._serialized_end = 9540 - _NADROP._serialized_start = 9543 - _NADROP._serialized_end = 9677 - _NAREPLACE._serialized_start = 9680 - _NAREPLACE._serialized_end = 9976 - _NAREPLACE_REPLACEMENT._serialized_start = 9835 - _NAREPLACE_REPLACEMENT._serialized_end = 9976 - _TODF._serialized_start = 9978 - _TODF._serialized_end = 10066 - _WITHCOLUMNSRENAMED._serialized_start = 10069 - _WITHCOLUMNSRENAMED._serialized_end = 10308 - _WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY._serialized_start = 10241 - _WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY._serialized_end = 10308 - _WITHCOLUMNS._serialized_start = 10310 - _WITHCOLUMNS._serialized_end = 10429 - _WITHWATERMARK._serialized_start = 10432 - _WITHWATERMARK._serialized_end = 10566 - _HINT._serialized_start = 10569 - _HINT._serialized_end = 10701 - _UNPIVOT._serialized_start = 10704 - _UNPIVOT._serialized_end = 11031 - _UNPIVOT_VALUES._serialized_start = 10961 - _UNPIVOT_VALUES._serialized_end = 11020 - _TOSCHEMA._serialized_start = 11033 - _TOSCHEMA._serialized_end = 11139 - _REPARTITIONBYEXPRESSION._serialized_start = 11142 - _REPARTITIONBYEXPRESSION._serialized_end = 11345 - _MAPPARTITIONS._serialized_start = 11348 - _MAPPARTITIONS._serialized_end = 11529 - _GROUPMAP._serialized_start = 11532 - _GROUPMAP._serialized_end = 12167 - _COGROUPMAP._serialized_start = 12170 - _COGROUPMAP._serialized_end = 12696 - _APPLYINPANDASWITHSTATE._serialized_start = 12699 - _APPLYINPANDASWITHSTATE._serialized_end = 13056 - _COMMONINLINEUSERDEFINEDTABLEFUNCTION._serialized_start = 13059 - _COMMONINLINEUSERDEFINEDTABLEFUNCTION._serialized_end = 13303 - _PYTHONUDTF._serialized_start = 13306 - _PYTHONUDTF._serialized_end = 13483 - _COLLECTMETRICS._serialized_start = 13486 - _COLLECTMETRICS._serialized_end = 13622 - _PARSE._serialized_start = 13625 - _PARSE._serialized_end = 14013 + _CACHEDLOCALRELATION._serialized_end = 7253 + _CACHEDREMOTERELATION._serialized_start = 7255 + _CACHEDREMOTERELATION._serialized_end = 7310 + _SAMPLE._serialized_start = 7313 + _SAMPLE._serialized_end = 7586 + _RANGE._serialized_start = 7589 + _RANGE._serialized_end = 7734 + _SUBQUERYALIAS._serialized_start = 7736 + _SUBQUERYALIAS._serialized_end = 7850 + _REPARTITION._serialized_start = 7853 + _REPARTITION._serialized_end = 7995 + _SHOWSTRING._serialized_start = 7998 + _SHOWSTRING._serialized_end = 8140 + _HTMLSTRING._serialized_start = 8142 + _HTMLSTRING._serialized_end = 8256 + _STATSUMMARY._serialized_start = 8258 + _STATSUMMARY._serialized_end = 8350 + _STATDESCRIBE._serialized_start = 8352 + _STATDESCRIBE._serialized_end = 8433 + _STATCROSSTAB._serialized_start = 8435 + _STATCROSSTAB._serialized_end = 8536 + _STATCOV._serialized_start = 8538 + _STATCOV._serialized_end = 8634 + _STATCORR._serialized_start = 8637 + _STATCORR._serialized_end = 8774 + _STATAPPROXQUANTILE._serialized_start = 8777 + _STATAPPROXQUANTILE._serialized_end = 8941 + _STATFREQITEMS._serialized_start = 8943 + _STATFREQITEMS._serialized_end = 9068 + _STATSAMPLEBY._serialized_start = 9071 + _STATSAMPLEBY._serialized_end = 9380 + _STATSAMPLEBY_FRACTION._serialized_start = 9272 + _STATSAMPLEBY_FRACTION._serialized_end = 9371 + _NAFILL._serialized_start = 9383 + _NAFILL._serialized_end = 9517 + _NADROP._serialized_start = 9520 + _NADROP._serialized_end = 9654 + _NAREPLACE._serialized_start = 9657 + _NAREPLACE._serialized_end = 9953 + _NAREPLACE_REPLACEMENT._serialized_start = 9812 + _NAREPLACE_REPLACEMENT._serialized_end = 9953 + _TODF._serialized_start = 9955 + _TODF._serialized_end = 10043 + _WITHCOLUMNSRENAMED._serialized_start = 10046 + _WITHCOLUMNSRENAMED._serialized_end = 10285 + _WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY._serialized_start = 10218 + _WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY._serialized_end = 10285 + _WITHCOLUMNS._serialized_start = 10287 + _WITHCOLUMNS._serialized_end = 10406 + _WITHWATERMARK._serialized_start = 10409 + _WITHWATERMARK._serialized_end = 10543 + _HINT._serialized_start = 10546 + _HINT._serialized_end = 10678 + _UNPIVOT._serialized_start = 10681 + _UNPIVOT._serialized_end = 11008 + _UNPIVOT_VALUES._serialized_start = 10938 + _UNPIVOT_VALUES._serialized_end = 10997 + _TOSCHEMA._serialized_start = 11010 + _TOSCHEMA._serialized_end = 11116 + _REPARTITIONBYEXPRESSION._serialized_start = 11119 + _REPARTITIONBYEXPRESSION._serialized_end = 11322 + _MAPPARTITIONS._serialized_start = 11325 + _MAPPARTITIONS._serialized_end = 11506 + _GROUPMAP._serialized_start = 11509 + _GROUPMAP._serialized_end = 12144 + _COGROUPMAP._serialized_start = 12147 + _COGROUPMAP._serialized_end = 12673 + _APPLYINPANDASWITHSTATE._serialized_start = 12676 + _APPLYINPANDASWITHSTATE._serialized_end = 13033 + _COMMONINLINEUSERDEFINEDTABLEFUNCTION._serialized_start = 13036 + _COMMONINLINEUSERDEFINEDTABLEFUNCTION._serialized_end = 13280 + _PYTHONUDTF._serialized_start = 13283 + _PYTHONUDTF._serialized_end = 13460 + _COLLECTMETRICS._serialized_start = 13463 + _COLLECTMETRICS._serialized_end = 13599 + _PARSE._serialized_start = 13602 + _PARSE._serialized_end = 13990 _PARSE_OPTIONSENTRY._serialized_start = 3987 _PARSE_OPTIONSENTRY._serialized_end = 4045 - _PARSE_PARSEFORMAT._serialized_start = 13914 - _PARSE_PARSEFORMAT._serialized_end = 14002 + _PARSE_PARSEFORMAT._serialized_start = 13891 + _PARSE_PARSEFORMAT._serialized_end = 13979 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/relations_pb2.pyi b/python/pyspark/sql/connect/proto/relations_pb2.pyi index 9cadd4acc52..007b92ef5f4 100644 --- a/python/pyspark/sql/connect/proto/relations_pb2.pyi +++ b/python/pyspark/sql/connect/proto/relations_pb2.pyi @@ -1647,28 +1647,15 @@ class CachedLocalRelation(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor - USERID_FIELD_NUMBER: builtins.int - SESSIONID_FIELD_NUMBER: builtins.int HASH_FIELD_NUMBER: builtins.int - userId: builtins.str - """(Required) An identifier of the user which created the local relation""" - sessionId: builtins.str - """(Required) An identifier of the Spark SQL session in which the user created the local relation.""" hash: builtins.str """(Required) A sha-256 hash of the serialized local relation in proto, see LocalRelation.""" def __init__( self, *, - userId: builtins.str = ..., - sessionId: builtins.str = ..., hash: builtins.str = ..., ) -> None: ... - def ClearField( - self, - field_name: typing_extensions.Literal[ - "hash", b"hash", "sessionId", b"sessionId", "userId", b"userId" - ], - ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["hash", b"hash"]) -> None: ... global___CachedLocalRelation = CachedLocalRelation --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org