allisonwang-db commented on code in PR #53481: URL: https://github.com/apache/spark/pull/53481#discussion_r2624397050
########## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/vectorExpressions.scala: ########## @@ -0,0 +1,611 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} +import org.apache.spark.sql.catalyst.util.ArrayData +import org.apache.spark.sql.errors.{QueryErrorsBase, QueryExecutionErrors} +import org.apache.spark.sql.types.{ArrayType, DataType, FloatType} + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(vector1, vector2) - Returns the cosine similarity between two float vectors. Review Comment: Instead of using vector, shall we say array1 and array2? 
########## sql/core/src/test/resources/sql-tests/inputs/vector-distance.sql: ########## @@ -0,0 +1,109 @@ +-- Tests for vector distance functions: vector_cosine_similarity, vector_inner_product, vector_l2_distance + +-- Basic functionality tests + +-- vector_cosine_similarity: basic test +SELECT vector_cosine_similarity(array(1.0F, 2.0F, 3.0F), array(4.0F, 5.0F, 6.0F)); + +-- vector_cosine_similarity: identical vectors (similarity = 1.0) +SELECT vector_cosine_similarity(array(1.0F, 0.0F, 0.0F), array(1.0F, 0.0F, 0.0F)); + +-- vector_cosine_similarity: orthogonal vectors (similarity = 0.0) +SELECT vector_cosine_similarity(array(1.0F, 0.0F), array(0.0F, 1.0F)); + +-- vector_cosine_similarity: opposite vectors (similarity = -1.0) +SELECT vector_cosine_similarity(array(1.0F, 0.0F), array(-1.0F, 0.0F)); + +-- vector_inner_product: basic test (1*4 + 2*5 + 3*6 = 32) +SELECT vector_inner_product(array(1.0F, 2.0F, 3.0F), array(4.0F, 5.0F, 6.0F)); + +-- vector_inner_product: orthogonal vectors (product = 0) +SELECT vector_inner_product(array(1.0F, 0.0F), array(0.0F, 1.0F)); + +-- vector_inner_product: self product (squared L2 norm: 3^2 + 4^2 = 25) +SELECT vector_inner_product(array(3.0F, 4.0F), array(3.0F, 4.0F)); + +-- vector_l2_distance: basic test (sqrt((4-1)^2 + (5-2)^2 + (6-3)^2) = sqrt(27)) +SELECT vector_l2_distance(array(1.0F, 2.0F, 3.0F), array(4.0F, 5.0F, 6.0F)); + +-- vector_l2_distance: identical vectors (distance = 0) +SELECT vector_l2_distance(array(1.0F, 2.0F), array(1.0F, 2.0F)); + +-- vector_l2_distance: 3-4-5 triangle (distance = 5) +SELECT vector_l2_distance(array(0.0F, 0.0F), array(3.0F, 4.0F)); + +-- Edge cases + +-- Empty vectors: cosine similarity returns NULL +SELECT vector_cosine_similarity(array(), array()); + +-- Empty vectors: inner product returns 0.0 +SELECT vector_inner_product(CAST(array() AS ARRAY<FLOAT>), CAST(array() AS ARRAY<FLOAT>)); + +-- Empty vectors: L2 distance returns 0.0 +SELECT vector_l2_distance(CAST(array() AS ARRAY<FLOAT>), 
CAST(array() AS ARRAY<FLOAT>)); + +-- Zero magnitude vector: cosine similarity returns NULL +SELECT vector_cosine_similarity(array(0.0F, 0.0F, 0.0F), array(1.0F, 2.0F, 3.0F)); + +-- NULL array input: cosine similarity returns NULL +SELECT vector_cosine_similarity(NULL, array(1.0F, 2.0F, 3.0F)); +SELECT vector_cosine_similarity(array(1.0F, 2.0F, 3.0F), NULL); + +-- NULL array input: inner product returns NULL +SELECT vector_inner_product(NULL, array(1.0F, 2.0F, 3.0F)); +SELECT vector_inner_product(array(1.0F, 2.0F, 3.0F), NULL); + +-- NULL array input: L2 distance returns NULL +SELECT vector_l2_distance(NULL, array(1.0F, 2.0F, 3.0F)); +SELECT vector_l2_distance(array(1.0F, 2.0F, 3.0F), NULL); + +-- Array containing NULL element: returns NULL +SELECT vector_cosine_similarity(array(1.0F, CAST(NULL AS FLOAT), 3.0F), array(1.0F, 2.0F, 3.0F)); +SELECT vector_inner_product(array(1.0F, CAST(NULL AS FLOAT), 3.0F), array(1.0F, 2.0F, 3.0F)); +SELECT vector_l2_distance(array(1.0F, CAST(NULL AS FLOAT), 3.0F), array(1.0F, 2.0F, 3.0F)); + +-- Dimension mismatch errors Review Comment: How about adding tests for type mismatch errors, e.g., when the first or second argument is not an array? ########## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/vectorExpressions.scala: ########## @@ -0,0 +1,611 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} +import org.apache.spark.sql.catalyst.util.ArrayData +import org.apache.spark.sql.errors.{QueryErrorsBase, QueryExecutionErrors} +import org.apache.spark.sql.types.{ArrayType, DataType, FloatType} + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(vector1, vector2) - Returns the cosine similarity between two float vectors. + The vectors must have the same dimension. + """, + examples = """ + Examples: + > SELECT _FUNC_(array(1.0F, 2.0F, 3.0F), array(4.0F, 5.0F, 6.0F)); + 0.97463185 + """, + since = "4.1.0", + group = "misc_funcs" Review Comment: Shall we add a new group for these functions? ########## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/vectorExpressions.scala: ########## @@ -0,0 +1,611 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} +import org.apache.spark.sql.catalyst.util.ArrayData +import org.apache.spark.sql.errors.{QueryErrorsBase, QueryExecutionErrors} +import org.apache.spark.sql.types.{ArrayType, DataType, FloatType} + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(vector1, vector2) - Returns the cosine similarity between two float vectors. + The vectors must have the same dimension. + """, + examples = """ + Examples: + > SELECT _FUNC_(array(1.0F, 2.0F, 3.0F), array(4.0F, 5.0F, 6.0F)); + 0.97463185 + """, + since = "4.1.0", Review Comment: Should be "4.2.0" -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
