(systemds) branch main updated: [SYSTEMDS-3179] Builtin for GloVe cooccurrence matrix computation

mboehm7 Sun, 11 May 2025 06:24:19 -0700

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git



The following commit(s) were added to refs/heads/main by this push:
     new 6fd08c0de8 [SYSTEMDS-3179] Builtin for GloVe cooccurrence matrix 
computation
6fd08c0de8 is described below

commit 6fd08c0de828773989879b8be696f251c6c1404d
Author: Samin <[email protected]>
AuthorDate: Sun May 11 15:21:33 2025 +0200

    [SYSTEMDS-3179] Builtin for GloVe cooccurrence matrix computation
    
    Closes #2200.
---
 scripts/builtin/cooccurrenceMatrix.dml             | 175 +++++++++++++++++++++
 .../java/org/apache/sysds/common/Builtins.java     |   1 +
 .../part1/BuiltinCooccurrenceMatrixTest.java       |  88 +++++++++++
 .../resources/datasets/GloVe/coocMatrixTest.csv    |   6 +
 .../functions/builtin/cooccurrenceMatrix.dml       |  25 +++
 5 files changed, 295 insertions(+)

diff --git a/scripts/builtin/cooccurrenceMatrix.dml 
b/scripts/builtin/cooccurrenceMatrix.dml
new file mode 100644
index 0000000000..86b8b9ca16
--- /dev/null
+++ b/scripts/builtin/cooccurrenceMatrix.dml
@@ -0,0 +1,175 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+#
+# The implementation is based on
+# https://github.com/stanfordnlp/GloVe/blob/master/src/cooccur.c
+#
+#-------------------------------------------------------------
+
+## Cleans and processes text data by removing punctuation, converting it to 
lowercase, and reformatting.
+## Adds an index column to the result.
+# INPUT:
+# 
------------------------------------------------------------------------------
+# S     (Frame[Unknown]): 1D input data frame containing text data.
+# 
------------------------------------------------------------------------------
+# OUTPUT:
+# 
------------------------------------------------------------------------------
+# result    (Frame[Unknown]): Processed text data with an index column.
+# 
------------------------------------------------------------------------------
+processText = function(Frame[Unknown] S) return (Frame[Unknown] result){
+    print("processText");
+    # Pass 1: delete periods outright (they are removed, not replaced by a blank).
+    noPeriods = map(S[,1], "x -> x.replaceAll(\"[.]\", \"\")");
+    # Pass 2: every remaining character that is neither a letter nor whitespace becomes a blank.
+    lettersOnly = map(noPeriods, "x -> x.replaceAll(\"[^a-zA-Z\\s]\", \" \")");
+    # Pass 3: lowercase the text.
+    lowered = map(lettersOnly, "x -> x.toLowerCase()");
+    # Prepend a running index 1..nrow(S) as the first column.
+    rowIds = as.frame(seq(1, nrow(S), 1));
+    result = cbind(rowIds, lowered);
+}
+
+## Tokenizes text data and retrieves word positions.
+# INPUT:
+# 
------------------------------------------------------------------------------
+# S           (Frame[Unknown]): 2D input text data with an index column.
+# maxTokens   (Int): Maximum number of tokens per text entry.
+# 
------------------------------------------------------------------------------
+# OUTPUT:
+# 
------------------------------------------------------------------------------
+# result  (Frame[Unknown]): Tokenized words.
+# docID   (Matrix[double]): Document ID matrix corresponding to tokens.
+# 
------------------------------------------------------------------------------
+getWordPosition = function(Frame[Unknown] S, Int maxTokens) return (Frame[Unknown] result, Matrix[double] docID){
+    print("getWordPosition");
+    # Tokenizer spec: "split" algorithm with "position" output, column 1 as the
+    # id column and column 2 as the text column to tokenize.
+    tokenizerSpec = "{\"algo\": \"split\", \"out\": \"position\",\"out_params\": {\"sort_alpha\": false},\"id_cols\": [1],\"tokenize_col\": 2}";
+    tokens = tokenize(target=S, spec=tokenizerSpec, max_tokens=maxTokens);
+    # Column 1 of the tokenizer output is returned as the document id,
+    # column 3 as the token itself.
+    docID = as.matrix(tokens[,1]);
+    result = tokens[,3];
+}
+
+## Encodes words into a numerical matrix format, retrieves the vocabulary 
size, and maps word indices.
+## Uses transformencode() to recode strings and find each unique string 
position in the co-occurrence matrix.
+# INPUT:
+# 
------------------------------------------------------------------------------
+# S     (Frame[Unknown]): 1D frame of tokenized word positions.
+# 
------------------------------------------------------------------------------
+# OUTPUT:
+# 
------------------------------------------------------------------------------
+# recodedWordPosition   (Matrix[double]): Encoded word positions as a 
numerical matrix.
+# tableSize            (Int): Number of distinct words in the input text 
(co-occurrence matrix size).
+# column               (Frame[Unknown]): Mapping of word indices to distinct 
words in the co-occurrence matrix.
+# 
------------------------------------------------------------------------------
+getRecodedMatrix = function(Frame[Unknown] S) return (Matrix[double] 
recodedWordPosition, Int tableSize, Frame[Unknown] column){
+    print("getRecodedMatrix");
+    # Recode column 1 of S; M is the second output of transformencode and is
+    # parsed below as one recode entry per row.
+    [recodedWordPosition, M] = transformencode(target=S, 
spec="{ids:true,recode:[1]}");
+
+    # Split every entry of M[,1] with UtilFunctions.splitRecodeEntry:
+    # element [0] is kept as the word, element [1] is parsed as its integer code.
+    distinctWord = map(M[,1], "s -> UtilFunctions.splitRecodeEntry(s)[0]");
+    index = map(M[,1], "s -> 
Integer.valueOf(UtilFunctions.splitRecodeEntry(s)[1])");
+    column = cbind(index, distinctWord);
+    # Ascending ordering of the codes; index.return=TRUE is presumably what
+    # makes order() return row positions rather than the sorted values.
+    sortedIndex = order(target=as.matrix(index), by=1, decreasing=FALSE, 
index.return=TRUE);
+
+    #TODO vectorize via order of frames
+    # Rewrite 'column' in place: each word is written to the row addressed by
+    # its own code, and row i of column 1 receives the code at sorted position i.
+    for(i in 1:nrow(sortedIndex)){
+        p = as.integer(as.scalar(sortedIndex[i,1]));
+        column[as.integer(as.scalar(index[p])), 2] = distinctWord[p];
+        column[i, 1] = as.scalar(index[p]);
+    }
+    # Vocabulary size = number of entries parsed from M.
+    tableSize = nrow(distinctWord);
+}
+
+## Iterates over the recoded word positions to construct a co-occurrence 
matrix.
+# INPUT:
+# 
------------------------------------------------------------------------------
+# recodedWordPosition (Matrix[double]): 2D matrix of recoded word positions 
with text IDs.
+# tableSize          (Int): Size of the vocabulary (number of unique words).
+# distanceWeighting  (Boolean): Flag to apply distance weighting to 
co-occurrence counts.
+# symmetric          (Boolean): Determines if the matrix is symmetric (TRUE) 
or asymmetric (FALSE).
+# windowSize        (Int): Context window size.
+# 
------------------------------------------------------------------------------
+# OUTPUT:
+# 
------------------------------------------------------------------------------
+# coocMatrix (Matrix[double]): Final word-word co-occurrence matrix.
+# 
------------------------------------------------------------------------------
+createCoocMatrix = function(
+    Matrix[double] recodedWordPosition,
+    Int tableSize,
+    boolean distanceWeighting,
+    boolean symmetric,
+    Int windowSize)
+return (Matrix[double] coocMatrix)
+{
+    # Accumulates co-occurrence weights into a tableSize x tableSize matrix.
+    # Column 1 of recodedWordPosition is read as the document id and column 2 as
+    # the word code; two rows are only paired when their document ids are equal.
+    print("Processing word cooccurrence...");
+    coocMatrix = matrix(0, tableSize, tableSize);
+    numTokens = nrow(recodedWordPosition);
+
+    # Skip the loops for empty input or a non-positive window so that a 1:0
+    # range is never evaluated; the result is then the all-zero matrix.
+    #TODO vectorize loop
+    if (numTokens > 0 & windowSize > 0) {
+        for (i in 1:numTokens) {
+            docId = as.integer(as.scalar(recodedWordPosition[i,1]));
+            wordIndex = as.integer(as.scalar(recodedWordPosition[i,2]));
+            # Code 0 is not a valid row/column of coocMatrix (transformencode was
+            # observed to emit it when running the JVM test), so skip such rows.
+            if (wordIndex != 0) {
+                for (j in 1:windowSize) {
+                    # The weight depends only on the distance j: 1/j when
+                    # distance weighting is on, otherwise a plain count of 1.
+                    increase = ifelse(distanceWeighting, 1.0 / j, 1.0);
+                    # Left context
+                    if (i-j > 0) {
+                        if (docId == as.integer(as.scalar(recodedWordPosition[i-j,1]))) {
+                            neighbourWordIndex = as.integer(as.scalar(recodedWordPosition[i-j,2]));
+                            # Same guard as for the centre word: never index with code 0.
+                            if (neighbourWordIndex != 0) {
+                                coocMatrix[wordIndex, neighbourWordIndex] = coocMatrix[wordIndex, neighbourWordIndex] + increase;
+                            }
+                        }
+                    }
+                    # Right context, only when a symmetric window is requested
+                    if (symmetric == TRUE) {
+                        if (i+j <= numTokens) {
+                            if (docId == as.integer(as.scalar(recodedWordPosition[i+j,1]))) {
+                                neighbourWordIndex = as.integer(as.scalar(recodedWordPosition[i+j,2]));
+                                if (neighbourWordIndex != 0) {
+                                    coocMatrix[wordIndex, neighbourWordIndex] = coocMatrix[wordIndex, neighbourWordIndex] + increase;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    print("Word-word cooccurrence matrix computation completed.");
+}
+
+## Main function to process text data to construct a word-word co-occurrence 
matrix.
+# INPUT:
+# 
------------------------------------------------------------------------------
+# input            (Frame[Unknown]): 1D input corpus in CSV format.
+# maxTokens        (Int): Maximum number of tokens per text entry.
+# windowSize       (Int): Context window size.
+# distanceWeighting (Boolean): Whether to apply distance-based weighting.
+# symmetric        (Boolean): Determines if the matrix is symmetric (TRUE) or 
asymmetric (FALSE).
+# 
------------------------------------------------------------------------------
+# OUTPUT:
+# 
------------------------------------------------------------------------------
+# coocMatrix (Matrix[double]): The computed co-occurrence matrix.
+# column     (Frame[Unknown]): Word-index mapping for the co-occurrence matrix.
+# 
------------------------------------------------------------------------------
+f_cooccurrenceMatrix = function(
+    Frame[Unknown] input,
+    Int maxTokens,
+    Int windowSize,
+    Boolean distanceWeighting,
+    Boolean symmetric) return (Matrix[Double] coocMatrix, Frame[Unknown] column){
+
+    # Step 1: clean the raw text and attach a row index.
+    cleaned = processText(input);
+    # Step 2: tokenize, keeping the document id of every token.
+    [tokens, docIds] = getWordPosition(cleaned, maxTokens);
+    # Step 3: recode tokens to integer codes and build the word-index mapping.
+    [wordCodes, vocabSize, column] = getRecodedMatrix(tokens);
+    # Step 4: count co-occurrences over (document id, word code) rows.
+    positions = cbind(docIds, wordCodes);
+    coocMatrix = createCoocMatrix(positions, vocabSize, distanceWeighting, symmetric, windowSize);
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java 
b/src/main/java/org/apache/sysds/common/Builtins.java
index d8caf6d11b..92398a66b0 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -93,6 +93,7 @@ public enum Builtins {
        CONV2D("conv2d", false),
        CONV2D_BACKWARD_FILTER("conv2d_backward_filter", false),
        CONV2D_BACKWARD_DATA("conv2d_backward_data", false),
+       COOCCURRENCEMATRIX("cooccurrenceMatrix", true),
        COR("cor", true),
        CORRECTTYPOS("correctTypos", true),
        CORRECTTYPOSAPPLY("correctTyposApply", true),
diff --git 
a/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinCooccurrenceMatrixTest.java
 
b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinCooccurrenceMatrixTest.java
new file mode 100644
index 0000000000..58b4aac457
--- /dev/null
+++ 
b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinCooccurrenceMatrixTest.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin.part1;
+
+import org.apache.sysds.common.Types;
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+/**
+ * Test for the cooccurrenceMatrix builtin: runs the DML test script on the
+ * GloVe/coocMatrixTest.csv resource and compares the written matrix against a
+ * hand-computed expectation.
+ */
+public class BuiltinCooccurrenceMatrixTest extends AutomatedTestBase {
+
+       private static final String TEST_NAME = "cooccurrenceMatrix";
+       private static final String TEST_DIR = "functions/builtin/";
+       private static final String RESOURCE_DIRECTORY = 
"src/test/resources/datasets/";
+       private static final String TEST_CLASS_DIR = TEST_DIR + 
BuiltinCooccurrenceMatrixTest.class.getSimpleName() + "/";
+       private static final double EPSILON = 1e-10; // Tolerance for comparison
+
+       // Registers the test configuration with "TestResult" as its only output.
+       @Override
+       public void setUp() {
+               addTestConfiguration(TEST_NAME,new 
TestConfiguration(TEST_CLASS_DIR, TEST_NAME,new String[]{"TestResult",}));
+       }
+
+       // maxTokens=20, windowSize=2, distanceWeighting=FALSE, symmetric=TRUE.
+       @Test
+       public void cooccurrenceMatrixTest() {
+               runCooccurrenceMatrix(20, 2, "FALSE", "TRUE");
+               HashMap<MatrixValue.CellIndex, Double> cooccurrenceMatrix = 
readDMLMatrixFromOutputDir("TestResult");
+               double[][] computedC = 
TestUtils.convertHashMapToDoubleArray(cooccurrenceMatrix);
+
+               // Unique words: {apple, banana, orange, grape}
+               // Co-occurrence based on word pairs in same sentences
+               // Rows and columns follow the word order above; the diagonal is 0.
+               double[][] expectedC = new double[][] {
+                               {0, 1, 2, 0},  // apple with {banana, orange}
+                               {1, 0, 3, 1},  // banana with {apple, orange, grape}
+                               {2, 3, 0, 2},  // orange with {apple, banana, grape}
+                               {0, 1, 2, 0}   // grape with {banana, orange}
+               };
+
+               TestUtils.compareMatrices(expectedC, computedC, 
expectedC.length, expectedC[0].length, EPSILON);
+
+       }
+
+       /**
+        * Runs the cooccurrenceMatrix DML test script with the given arguments.
+        * The exec mode is set via setExecMode(Types.ExecType.CP) and the previous
+        * value is written back to rtplatform in the finally block.
+        *
+        * @param maxTokens         passed to the script as maxTokens
+        * @param windowSize        passed to the script as windowSize
+        * @param distanceWeighting passed to the script as distanceWeighting
+        * @param symmetric         passed to the script as symmetric
+        */
+       public void runCooccurrenceMatrix(Integer maxTokens, Integer 
windowSize, String distanceWeighting, String symmetric) {
+               // Load test configuration
+               Types.ExecMode platformOld = setExecMode(Types.ExecType.CP);
+               try{
+                       loadTestConfiguration(getTestConfiguration(TEST_NAME));
+
+                       String HOME = SCRIPT_DIR + TEST_DIR;
+
+                       fullDMLScriptName = HOME + TEST_NAME + ".dml";
+
+                       // Named arguments read by the script as $input, $maxTokens, ...
+                       programArgs = new String[]{"-nvargs",
+                                       "input=" + RESOURCE_DIRECTORY + 
"GloVe/coocMatrixTest.csv",
+                                       "maxTokens=" + maxTokens,
+                                       "windowSize=" + windowSize,
+                                       "distanceWeighting=" + 
distanceWeighting,
+                                       "symmetric=" + symmetric,
+                                       "out_file=" + output("TestResult")};
+                       System.out.println("Run dml script..");
+                       runTest(true, false, null, -1);
+                       System.out.println("DONE");
+               }
+               finally {
+                       // Restore the exec mode that was active before this test.
+                       rtplatform = platformOld;
+               }
+       }
+}
diff --git a/src/test/resources/datasets/GloVe/coocMatrixTest.csv 
b/src/test/resources/datasets/GloVe/coocMatrixTest.csv
new file mode 100644
index 0000000000..b495147c5b
--- /dev/null
+++ b/src/test/resources/datasets/GloVe/coocMatrixTest.csv
@@ -0,0 +1,6 @@
+apple banana orange.
+banana orange grape.
+apple. orange
+grape 1111 ------ orange.
+------ <<<<<<< 1111 22222.
+banana orange
diff --git a/src/test/scripts/functions/builtin/cooccurrenceMatrix.dml 
b/src/test/scripts/functions/builtin/cooccurrenceMatrix.dml
new file mode 100644
index 0000000000..591e228903
--- /dev/null
+++ b/src/test/scripts/functions/builtin/cooccurrenceMatrix.dml
@@ -0,0 +1,25 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Read the corpus as a frame from the CSV file given by $input (no header).
+corpus = read($input, data_type="frame", format="csv", sep=",", header=FALSE);
+
+# Run the builtin; only the matrix output is written, the mapping is unused here.
+[cooc, wordIndexMap] = cooccurrenceMatrix(corpus, $maxTokens, $windowSize, $distanceWeighting, $symmetric);
+write(cooc, $out_file , data_type="matrix");

(systemds) branch main updated: [SYSTEMDS-3179] Builtin for GloVe cooccurrence matrix computation

Reply via email to