Iseratho commented on a change in pull request #993:
URL: https://github.com/apache/systemds/pull/993#discussion_r457611110



##########
File path: scripts/staging/entity-resolution/entity-clustering.dml
##########
@@ -0,0 +1,119 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# THIS SCRIPT PERFORMS AN ENTITY RESOLUTION PIPELINE FOR CLUSTERING ON A 
SINGLE FILE
+# CONSISTS OF BLOCKING, MATCHING, AND CLUSTERING
+#
+# INPUT PARAMETERS:
+# 
---------------------------------------------------------------------------------------------
+# NAME           TYPE   DEFAULT  MEANING
+# 
---------------------------------------------------------------------------------------------
+# FX              String  ---     Location to read the frame of tokens in bow 
format
+#                                 Each line contains comma separated list of 
id, token and value
+# OUT             String  ---     Location to save the output of matching pairs
+#                                 Each line contains comma separated ids of 
one matched pair
+#                                 Third column provides the similarity score
+# threshold       Double  0.9     Threshold to be considered as a match
+# blocking_method String  lsh     Possible values: ["naive", "lsh"].
+# num_blocks      Int     1       Number of blocks for naive blocking
+# num_hashtables  Int     6       Number of hashtables for LSH blocking.
+# num_hyperplanes Int     4       Number of hyperplanes for LSH blocking.
+
+# use_tokens      Boolean TRUE    Whether to use the tokens of FX to generate 
predictions
+# use_embeddings  Boolean FALSE   Whether to use the embeddings of XE to 
generate predictions
+# XE              String  ---     Location to read the frame of embedding 
matrix
+#                                 Required if use_embeddings is set to TRUE
+# store_mapping   Boolean FALSE   Whether to store the mapping of 
transformencode
+# MX              String  ---     Location to write the frame of mapping
+#                                Required if store_mapping is set to TRUE
+# 
---------------------------------------------------------------------------------------------
+# OUTPUT: frame of matching pairs
+# 
---------------------------------------------------------------------------------------------
+
+source("./scripts/staging/entity-resolution/primitives/preprocessing.dml") as 
pre;
+source("./scripts/staging/entity-resolution/primitives/postprocessing.dml") as 
post;
+source("./scripts/staging/entity-resolution/primitives/pipeline.dml") as pipe;
+
+# Command Line Arguments
+fileFX = $FX;
+fileOUT = $OUT;
+
+threshold = ifdef($threshold, 0.9);
+blocking_method = ifdef($blocking_method, "lsh");
+num_blocks = ifdef($num_blocks, 1);
+num_hyperplanes = ifdef($num_hyperplanes, 4);
+num_hashtables = ifdef($num_hashtables, 6);
+use_tokens = ifdef($use_tokens, TRUE);
+use_embeddings = ifdef($use_embeddings, FALSE);
+# file XE is only required if using embeddings
+fileXE = ifdef($XE, "");
+# mapping file is required for evaluation
+store_mapping = ifdef($store_mapping, FALSE);
+fileMX = ifdef($MX, "");
+
+if (!(blocking_method == "naive" | blocking_method == "lsh")) {
+  print("ERROR: blocking method must be in ['naive', 'lsh']");
+}
+
+# Read data
+FX = read(fileFX);
+if (use_embeddings) {
+  if (fileXE == "") {
+    print("You need to specify file XE when use_embeddings is set to TRUE");
+  } else {
+    X_embeddings = read(fileXE);
+  }
+}
+
+# Convert data
+[X, MX] = pre::convert_frame_tokens_to_matrix_bow(FX);
+if (use_tokens & use_embeddings) {
+  X = cbind(X, X_embeddings);
+} else if (use_tokens) {
+  # Nothing to do in this case, since X already contains tokens
+} else if (use_embeddings) {
+  X = X_embeddings;
+} else {
+  print("Either use_tokens or use_embeddings needs to be TRUE, using tokens 
only as default.");
+}
+
+if (store_mapping) {
+  if (fileMX == "") {
+    print("You need to specify file MX when store_mapping is set to TRUE.");
+  } else {
+    write(MX, fileMX);
+  }
+}
+
+# Perform clustering
+if (blocking_method == "naive") {
+  CLUSTER = pipe::entity_clustering_pipeline(X, num_blocks, threshold);
+} else if (blocking_method == "lsh") {
+  CLUSTER = pipe::entity_clustering_pipeline_lsh(X, num_hashtables, 
num_hyperplanes, threshold);
+}

Review comment:
       These two pipelines differ in their blocking technique. The performance 
depends on the values chosen for the blocking algorithm (i.e., num_blocks, 
num_hashtables, and num_hyperplanes).
   
   I am not sure whether kmeans makes much sense in this case (for duplicate 
detection). How would you select the number of clusters?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to