This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 6fd08c0de8 [SYSTEMDS-3179] Builtin for GloVe cooccurrence matrix
computation
6fd08c0de8 is described below
commit 6fd08c0de828773989879b8be696f251c6c1404d
Author: Samin <[email protected]>
AuthorDate: Sun May 11 15:21:33 2025 +0200
[SYSTEMDS-3179] Builtin for GloVe cooccurrence matrix computation
Closes #2200.
---
scripts/builtin/cooccurrenceMatrix.dml | 175 +++++++++++++++++++++
.../java/org/apache/sysds/common/Builtins.java | 1 +
.../part1/BuiltinCooccurrenceMatrixTest.java | 88 +++++++++++
.../resources/datasets/GloVe/coocMatrixTest.csv | 6 +
.../functions/builtin/cooccurrenceMatrix.dml | 25 +++
5 files changed, 295 insertions(+)
diff --git a/scripts/builtin/cooccurrenceMatrix.dml
b/scripts/builtin/cooccurrenceMatrix.dml
new file mode 100644
index 0000000000..86b8b9ca16
--- /dev/null
+++ b/scripts/builtin/cooccurrenceMatrix.dml
@@ -0,0 +1,175 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+#
+# The implementation is based on
+# https://github.com/stanfordnlp/GloVe/blob/master/src/cooccur.c
+#
+#-------------------------------------------------------------
+
+## Cleans text data: deletes periods, replaces every other character that is
+## neither a letter nor whitespace with a space, and converts it to lowercase.
+## Adds an index column to the result.
+# INPUT:
+#
------------------------------------------------------------------------------
+# S (Frame[Unknown]): 1D input data frame containing text data.
+#
------------------------------------------------------------------------------
+# OUTPUT:
+#
------------------------------------------------------------------------------
+# result (Frame[Unknown]): Processed text data with an index column.
+#
------------------------------------------------------------------------------
+processText = function(Frame[Unknown] S) return (Frame[Unknown] result){
+  print("processText");
+  # Periods are deleted outright so they leave no gap behind.
+  noPeriods = map(S[,1], "x -> x.replaceAll(\"[.]\", \"\")");
+  # Any remaining character that is neither a letter nor whitespace becomes a blank.
+  lettersOnly = map(noPeriods, "x -> x.replaceAll(\"[^a-zA-Z\\s]\", \" \")");
+  lowered = map(lettersOnly, "x -> x.toLowerCase()");
+  # Running row numbers 1..nrow(S) become the leading index column.
+  rowIds = as.frame(seq(1, nrow(S), 1));
+  result = cbind(rowIds, lowered);
+}
+
+## Tokenizes text data and retrieves word positions.
+# INPUT:
+#
------------------------------------------------------------------------------
+# S (Frame[Unknown]): 2D input text data with an index column.
+# maxTokens (Int): Maximum number of tokens per text entry.
+#
------------------------------------------------------------------------------
+# OUTPUT:
+#
------------------------------------------------------------------------------
+# result (Frame[Unknown]): Tokenized words.
+# docID (Matrix[double]): Document ID matrix corresponding to tokens.
+#
------------------------------------------------------------------------------
+getWordPosition = function(Frame[Unknown] S, Int maxTokens) return
(Frame[Unknown] result, Matrix[double] docID){
+ print("getWordPosition");
+ # Tokenizer spec: "split" algorithm with "position" output and no alphabetical
+ # sorting; column 1 of S is the id column and column 2 is the text to tokenize.
+ jspec_pos = "{\"algo\": \"split\", \"out\": \"position\",\"out_params\":
{\"sort_alpha\": false},\"id_cols\": [1],\"tokenize_col\": 2}";
+ wordPosition = tokenize(target=S, spec=jspec_pos, max_tokens=maxTokens);
+ # Column 3 of the tokenizer output is returned as the tokens and column 1 as
+ # the document ids (presumably column 2 holds the position - TODO confirm).
+ result = wordPosition[,3];
+ docID = as.matrix(wordPosition[,1]);
+}
+
+## Encodes words into a numerical matrix format, retrieves the vocabulary
size, and maps word indices.
+## Uses transformencode() to recode strings and find each unique string
position in the co-occurrence matrix.
+# INPUT:
+#
------------------------------------------------------------------------------
+# S (Frame[Unknown]): 1D frame of tokenized word positions.
+#
------------------------------------------------------------------------------
+# OUTPUT:
+#
------------------------------------------------------------------------------
+# recodedWordPosition (Matrix[double]): Encoded word positions as a
numerical matrix.
+# tableSize (Int): Number of distinct words in the input text
(co-occurrence matrix size).
+# column (Frame[Unknown]): Mapping of word indices to distinct
words in the co-occurrence matrix.
+#
------------------------------------------------------------------------------
+getRecodedMatrix = function(Frame[Unknown] S) return (Matrix[double]
recodedWordPosition, Int tableSize, Frame[Unknown] column){
+ print("getRecodedMatrix");
+ # Recode column 1 of S into numeric codes; M is the metadata frame that
+ # transformencode returns alongside the encoded matrix.
+ [recodedWordPosition, M] = transformencode(target=S,
spec="{ids:true,recode:[1]}");
+
+ # Split every recode entry of M: element [0] is kept as the word and
+ # element [1] is parsed as its integer code.
+ distinctWord = map(M[,1], "s -> UtilFunctions.splitRecodeEntry(s)[0]");
+ index = map(M[,1], "s ->
Integer.valueOf(UtilFunctions.splitRecodeEntry(s)[1])");
+ column = cbind(index, distinctWord);
+ # With index.return=TRUE, row i of sortedIndex is the row of `index` that
+ # holds the i-th smallest code.
+ sortedIndex = order(target=as.matrix(index), by=1, decreasing=FALSE,
index.return=TRUE);
+
+ #TODO vectorize via order of frames
+ # Rewrite `column` in place so that it is ordered by code.
+ # NOTE(review): the word is written to the row given by its code while the
+ # code is written to row i; both agree only if the codes are exactly 1..n -
+ # confirm against transformencode's recode map.
+ for(i in 1:nrow(sortedIndex)){
+ p = as.integer(as.scalar(sortedIndex[i,1]));
+ column[as.integer(as.scalar(index[p])), 2] = distinctWord[p];
+ column[i, 1] = as.scalar(index[p]);
+ }
+ # One recode entry per distinct word, so this is the co-occurrence matrix size.
+ tableSize = nrow(distinctWord);
+}
+
+## Iterates over the recoded word positions to construct a co-occurrence
matrix.
+# INPUT:
+#
------------------------------------------------------------------------------
+# recodedWordPosition (Matrix[double]): 2D matrix of recoded word positions
with text IDs.
+# tableSize (Int): Size of the vocabulary (number of unique words).
+# distanceWeighting (Boolean): Flag to apply distance weighting to
co-occurrence counts.
+# symmetric (Boolean): Determines if the matrix is symmetric (TRUE)
or asymmetric (FALSE).
+# windowSize (Int): Context window size.
+#
------------------------------------------------------------------------------
+# OUTPUT:
+#
------------------------------------------------------------------------------
+# coocMatrix (Matrix[double]): Final word-word co-occurrence matrix.
+#
------------------------------------------------------------------------------
+createCoocMatrix = function(
+  Matrix[double] recodedWordPosition,
+  Int tableSize,
+  boolean distanceWeighting,
+  boolean symmetric,
+  Int windowSize)
+return (Matrix[double] coocMatrix)
+{
+  # Builds the tableSize x tableSize co-occurrence matrix by scanning the token
+  # stream. Column 1 of recodedWordPosition is the document id, column 2 the
+  # recoded word id. For every token, the up to windowSize preceding tokens of
+  # the same document are counted (and the following ones too when symmetric
+  # is TRUE). Each hit adds 1, or 1/distance when distanceWeighting is TRUE.
+  print("Processing word cooccurrence...");
+  coocMatrix = matrix(0, tableSize, tableSize);
+  numTokens = nrow(recodedWordPosition);
+
+  # A window smaller than 1 has no context positions; guarding here also keeps
+  # the range 1:windowSize from being evaluated with an end below its start.
+  if (windowSize >= 1) {
+    #TODO vectorize loop
+    for (i in 1:numTokens) {
+      docId = as.integer(as.scalar(recodedWordPosition[i,1]));
+      wordIndex = as.integer(as.scalar(recodedWordPosition[i,2]));
+      # 0 is not a valid row/column of coocMatrix. Per the original author's
+      # note, transformencode can produce it when run from the JVM test.
+      if (wordIndex != 0) {
+        for (j in 1:windowSize) {
+          # The weight depends only on the distance j, so compute it once for
+          # both directions.
+          increase = ifelse(distanceWeighting, 1.0 / j, 1.0);
+
+          # Left context. Bounds are tested in an outer if so that row i-j is
+          # only read when it exists.
+          if (i-j > 0) {
+            if (docId == as.integer(as.scalar(recodedWordPosition[i-j,1]))) {
+              neighbourWordIndex = as.integer(as.scalar(recodedWordPosition[i-j,2]));
+              # Skip invalid neighbours for the same reason the centre word is
+              # skipped: id 0 cannot address a matrix cell.
+              if (neighbourWordIndex != 0) {
+                coocMatrix[wordIndex, neighbourWordIndex] = coocMatrix[wordIndex, neighbourWordIndex] + increase;
+              }
+            }
+          }
+
+          # Right context, only when a symmetric window is requested.
+          if (symmetric == TRUE) {
+            if (i+j <= numTokens) {
+              if (docId == as.integer(as.scalar(recodedWordPosition[i+j,1]))) {
+                neighbourWordIndex = as.integer(as.scalar(recodedWordPosition[i+j,2]));
+                if (neighbourWordIndex != 0) {
+                  coocMatrix[wordIndex, neighbourWordIndex] = coocMatrix[wordIndex, neighbourWordIndex] + increase;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  print("Word-word cooccurrence matrix computation completed.");
+}
+
+## Main function to process text data to construct a word-word co-occurrence
matrix.
+# INPUT:
+#
------------------------------------------------------------------------------
+# input (Frame[Unknown]): 1D input corpus in CSV format.
+# maxTokens (Int): Maximum number of tokens per text entry.
+# windowSize (Int): Context window size.
+# distanceWeighting (Boolean): Whether to apply distance-based weighting.
+# symmetric (Boolean): Determines if the matrix is symmetric (TRUE) or
asymmetric (FALSE).
+#
------------------------------------------------------------------------------
+# OUTPUT:
+#
------------------------------------------------------------------------------
+# coocMatrix (Matrix[double]): The computed co-occurrence matrix.
+# column (Frame[Unknown]): Word-index mapping for the co-occurrence matrix.
+#
------------------------------------------------------------------------------
+f_cooccurrenceMatrix = function(
+  Frame[Unknown] input,
+  Int maxTokens,
+  Int windowSize,
+  Boolean distanceWeighting,
+  Boolean symmetric) return (Matrix[Double] coocMatrix, Frame[Unknown] column){
+
+  # 1) Clean the raw text and attach a running document index.
+  cleaned = processText(input);
+  # 2) Tokenize: one row per token plus the id of the document it came from.
+  [tokens, docIds] = getWordPosition(cleaned, maxTokens);
+  # 3) Recode tokens to numeric ids; `column` is the id-to-word mapping.
+  [tokenCodes, vocabSize, column] = getRecodedMatrix(tokens);
+  # 4) Count co-occurrences over the (document id, word id) stream.
+  docAndCode = cbind(docIds, tokenCodes);
+  coocMatrix = createCoocMatrix(docAndCode, vocabSize, distanceWeighting, symmetric, windowSize);
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java
b/src/main/java/org/apache/sysds/common/Builtins.java
index d8caf6d11b..92398a66b0 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -93,6 +93,7 @@ public enum Builtins {
CONV2D("conv2d", false),
CONV2D_BACKWARD_FILTER("conv2d_backward_filter", false),
CONV2D_BACKWARD_DATA("conv2d_backward_data", false),
+ COOCCURRENCEMATRIX("cooccurrenceMatrix", true),
COR("cor", true),
CORRECTTYPOS("correctTypos", true),
CORRECTTYPOSAPPLY("correctTyposApply", true),
diff --git
a/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinCooccurrenceMatrixTest.java
b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinCooccurrenceMatrixTest.java
new file mode 100644
index 0000000000..58b4aac457
--- /dev/null
+++
b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinCooccurrenceMatrixTest.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin.part1;
+
+import org.apache.sysds.common.Types;
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class BuiltinCooccurrenceMatrixTest extends AutomatedTestBase {
+
+ private static final String TEST_NAME = "cooccurrenceMatrix";
+ private static final String TEST_DIR = "functions/builtin/";
+ private static final String RESOURCE_DIRECTORY =
"src/test/resources/datasets/";
+ private static final String TEST_CLASS_DIR = TEST_DIR +
BuiltinCooccurrenceMatrixTest.class.getSimpleName() + "/";
+ private static final double EPSILON = 1e-10; // Tolerance for comparison
+
+ @Override
+ public void setUp() {
+ addTestConfiguration(TEST_NAME,new
TestConfiguration(TEST_CLASS_DIR, TEST_NAME,new String[]{"TestResult",}));
+ }
+
+ @Test
+ public void cooccurrenceMatrixTest() {
+ runCooccurrenceMatrix(20, 2, "FALSE", "TRUE");
+ HashMap<MatrixValue.CellIndex, Double> cooccurrenceMatrix =
readDMLMatrixFromOutputDir("TestResult");
+ double[][] computedC =
TestUtils.convertHashMapToDoubleArray(cooccurrenceMatrix);
+
+ // Unique words: {apple, banana, orange, grape}
+ // Co-occurrence based on word pairs in same sentences
+ double[][] expectedC = new double[][] {
+ {0, 1, 2, 0}, // apple with {banana, orange}
+ {1, 0, 3, 1}, // banana with {apple, orange,
grape}
+ {2, 3, 0, 2}, // orange with {apple, banana,
grape}
+ {0, 1, 2, 0} // grape with {banana, orange,
grape}
+ };
+
+ TestUtils.compareMatrices(expectedC, computedC,
expectedC.length, expectedC[0].length, EPSILON);
+
+ }
+
+ public void runCooccurrenceMatrix(Integer maxTokens, Integer
windowSize, String distanceWeighting, String symmetric) {
+ // Load test configuration
+ Types.ExecMode platformOld = setExecMode(Types.ExecType.CP);
+ try{
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+
+ String HOME = SCRIPT_DIR + TEST_DIR;
+
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
+
+ programArgs = new String[]{"-nvargs",
+ "input=" + RESOURCE_DIRECTORY +
"GloVe/coocMatrixTest.csv",
+ "maxTokens=" + maxTokens,
+ "windowSize=" + windowSize,
+ "distanceWeighting=" +
distanceWeighting,
+ "symmetric=" + symmetric,
+ "out_file=" + output("TestResult")};
+ System.out.println("Run dml script..");
+ runTest(true, false, null, -1);
+ System.out.println("DONE");
+ }
+ finally {
+ rtplatform = platformOld;
+ }
+ }
+}
diff --git a/src/test/resources/datasets/GloVe/coocMatrixTest.csv
b/src/test/resources/datasets/GloVe/coocMatrixTest.csv
new file mode 100644
index 0000000000..b495147c5b
--- /dev/null
+++ b/src/test/resources/datasets/GloVe/coocMatrixTest.csv
@@ -0,0 +1,6 @@
+apple banana orange.
+banana orange grape.
+apple. orange
+grape 1111 ------ orange.
+------ <<<<<<< 1111 22222.
+banana orange
diff --git a/src/test/scripts/functions/builtin/cooccurrenceMatrix.dml
b/src/test/scripts/functions/builtin/cooccurrenceMatrix.dml
new file mode 100644
index 0000000000..591e228903
--- /dev/null
+++ b/src/test/scripts/functions/builtin/cooccurrenceMatrix.dml
@@ -0,0 +1,25 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Read the corpus as a frame (no header row).
+corpus = read($input, data_type="frame", format="csv", sep=",", header=FALSE);
+
+# Only the matrix is persisted; the word-index mapping is computed but unused here.
+[cooc, wordIndexMap] = cooccurrenceMatrix(corpus, $maxTokens, $windowSize, $distanceWeighting, $symmetric);
+write(cooc, $out_file , data_type="matrix");