hailin0 commented on code in PR #8603:
URL: https://github.com/apache/seatunnel/pull/8603#discussion_r1958234416
##########
seatunnel-plugin-discovery/src/main/java/org/apache/seatunnel/plugin/discovery/AbstractPluginDiscovery.java:
##########
@@ -455,104 +453,53 @@ public boolean accept(File pathname) {
}
}
- private static File findMostSimlarPluginJarFile(
- File[] targetPluginFiles, String pluginJarPrefix) {
- String splitRegex = "\\-|\\_|\\.";
- double maxSimlarity = -Integer.MAX_VALUE;
- int mostSimlarPluginJarFileIndex = -1;
- for (int i = 0; i < targetPluginFiles.length; i++) {
- File file = targetPluginFiles[i];
- String fileName = file.getName();
- double similarity =
- CosineSimilarityUtil.cosineSimilarity(pluginJarPrefix,
fileName, splitRegex);
- if (similarity > maxSimlarity) {
- maxSimlarity = similarity;
- mostSimlarPluginJarFileIndex = i;
- }
+ private Optional<URL> selectPluginJar(
+ File[] targetPluginFiles, String pluginJarPrefix, String
pluginName, PluginType type) {
+ List<URL> resMatchedUrls = new ArrayList<>();
+ for (File file : targetPluginFiles) {
+ Optional<URL> matchedUrl = findMatchingUrl(file, type);
+ matchedUrl.ifPresent(resMatchedUrls::add);
+ }
+ if (resMatchedUrls.size() != 1) {
+ throw new SeaTunnelException(
+ String.format(
+ "Cannot find unique plugin jar for
pluginIdentifier: %s -> %s",
+ pluginName, pluginJarPrefix));
+ } else {
+ return Optional.of(resMatchedUrls.get(0));
}
- return targetPluginFiles[mostSimlarPluginJarFileIndex];
}
- static class CosineSimilarityUtil {
- public static double cosineSimilarity(String textA, String textB,
String splitRegrex) {
- Set<String> words1 =
- new
HashSet<>(Arrays.asList(textA.toLowerCase().split(splitRegrex)));
- Set<String> words2 =
- new
HashSet<>(Arrays.asList(textB.toLowerCase().split(splitRegrex)));
- int[] termFrequency1 = calculateTermFrequencyVector(textA, words1,
splitRegrex);
- int[] termFrequency2 = calculateTermFrequencyVector(textB, words2,
splitRegrex);
- return calculateCosineSimilarity(termFrequency1, termFrequency2);
+ private Optional<URL> findMatchingUrl(File file, PluginType type) {
+ Map<PluginIdentifier, String> pluginInstanceMap = null;
+ switch (type) {
+ case SINK:
+ pluginInstanceMap = sinkPluginInstance;
+ break;
+ case SOURCE:
+ pluginInstanceMap = sourcePluginInstance;
+ break;
+ case TRANSFORM:
+ pluginInstanceMap = transformPluginInstance;
+ break;
}
-
- private static int[] calculateTermFrequencyVector(
- String text, Set<String> words, String splitRegrex) {
- int[] termFrequencyVector = new int[words.size()];
- String[] textArray = text.toLowerCase().split(splitRegrex);
- List<String> orderedWords = new ArrayList<String>();
- words.clear();
- for (String word : textArray) {
- if (!words.contains(word)) {
- orderedWords.add(word);
- words.add(word);
- }
- }
- for (String word : textArray) {
- if (words.contains(word)) {
- int index = 0;
- for (String w : orderedWords) {
- if (w.equals(word)) {
- termFrequencyVector[index]++;
- break;
- }
- index++;
- }
- }
- }
- return termFrequencyVector;
+ if (pluginInstanceMap == null) {
+ return Optional.empty();
}
-
- private static double calculateCosineSimilarity(int[] vectorA, int[]
vectorB) {
- double dotProduct = 0.0;
- double magnitudeA = 0.0;
- double magnitudeB = 0.0;
- int vectorALength = vectorA.length;
- int vectorBLength = vectorB.length;
- if (vectorALength < vectorBLength) {
- int[] vectorTemp = new int[vectorBLength];
- for (int i = 0; i < vectorB.length; i++) {
- if (i <= vectorALength - 1) {
- vectorTemp[i] = vectorA[i];
- } else {
- vectorTemp[i] = 0;
- }
+ List<URL> matchedUrls = new ArrayList<>();
+ for (Map.Entry<PluginIdentifier, String> entry :
pluginInstanceMap.entrySet()) {
+ if (file.getName().startsWith(entry.getValue())) {
+ try {
+ matchedUrls.add(file.toURI().toURL());
+ } catch (MalformedURLException e) {
+ log.warn("Cannot get plugin URL for pluginIdentifier: {}",
file, e);
}
- vectorA = vectorTemp;
- }
- if (vectorALength > vectorBLength) {
- int[] vectorTemp = new int[vectorALength];
- for (int i = 0; i < vectorA.length; i++) {
- if (i <= vectorBLength - 1) {
- vectorTemp[i] = vectorB[i];
- } else {
- vectorTemp[i] = 0;
- }
- }
- vectorB = vectorTemp;
- }
- for (int i = 0; i < vectorA.length; i++) {
- dotProduct += vectorA[i] * vectorB[i];
- magnitudeA += Math.pow(vectorA[i], 2);
- magnitudeB += Math.pow(vectorB[i], 2);
}
+ }
- magnitudeA = Math.sqrt(magnitudeA);
- magnitudeB = Math.sqrt(magnitudeB);
-
- if (magnitudeA == 0 || magnitudeB == 0) {
- return 0.0; // Avoid dividing by 0
- } else {
- return dotProduct / (magnitudeA * magnitudeB);
- }
+ if (matchedUrls.size() == 1) {
+ return Optional.of(matchedUrls.get(0));
}
+ return Optional.empty();
Review Comment:
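Please add debug logs here, e.g. when the file does not match exactly one plugin prefix and `Optional.empty()` is returned.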
##########
seatunnel-plugin-discovery/src/main/java/org/apache/seatunnel/plugin/discovery/AbstractPluginDiscovery.java:
##########
@@ -455,104 +453,53 @@ public boolean accept(File pathname) {
}
}
- private static File findMostSimlarPluginJarFile(
- File[] targetPluginFiles, String pluginJarPrefix) {
- String splitRegex = "\\-|\\_|\\.";
- double maxSimlarity = -Integer.MAX_VALUE;
- int mostSimlarPluginJarFileIndex = -1;
- for (int i = 0; i < targetPluginFiles.length; i++) {
- File file = targetPluginFiles[i];
- String fileName = file.getName();
- double similarity =
- CosineSimilarityUtil.cosineSimilarity(pluginJarPrefix,
fileName, splitRegex);
- if (similarity > maxSimlarity) {
- maxSimlarity = similarity;
- mostSimlarPluginJarFileIndex = i;
- }
+ private Optional<URL> selectPluginJar(
+ File[] targetPluginFiles, String pluginJarPrefix, String
pluginName, PluginType type) {
+ List<URL> resMatchedUrls = new ArrayList<>();
+ for (File file : targetPluginFiles) {
+ Optional<URL> matchedUrl = findMatchingUrl(file, type);
+ matchedUrl.ifPresent(resMatchedUrls::add);
+ }
+ if (resMatchedUrls.size() != 1) {
+ throw new SeaTunnelException(
+ String.format(
+ "Cannot find unique plugin jar for
pluginIdentifier: %s -> %s",
Review Comment:
Please add the matched URLs (`resMatchedUrls`) to the error message.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]