Author: koji
Date: Mon Dec 10 00:28:01 2012
New Revision: 1419182
URL: http://svn.apache.org/viewvc?rev=1419182&view=rev
Log:
add draft version of Java programs
Added:
labs/alike/trunk/src/main/java/org/
labs/alike/trunk/src/main/java/org/apache/
labs/alike/trunk/src/main/java/org/apache/alike/
labs/alike/trunk/src/main/java/org/apache/alike/Clustering.java
labs/alike/trunk/src/main/java/org/apache/alike/HistogramMatching.java
Added: labs/alike/trunk/src/main/java/org/apache/alike/Clustering.java
URL:
http://svn.apache.org/viewvc/labs/alike/trunk/src/main/java/org/apache/alike/Clustering.java?rev=1419182&view=auto
==============================================================================
--- labs/alike/trunk/src/main/java/org/apache/alike/Clustering.java (added)
+++ labs/alike/trunk/src/main/java/org/apache/alike/Clustering.java Mon Dec 10
00:28:01 2012
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.alike;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.kmeans.Kluster;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * Draft driver that turns descriptor text files into Mahout input vectors,
+ * seeds the initial clusters with the first {@link #K} vectors and then runs
+ * k-means through {@code KMeansDriver}.
+ *
+ * <p>Each {@code *.txt} file found in {@link #SRC_DIR} is read as: two header
+ * lines (skipped), followed by one whitespace-separated numeric vector per line.
+ */
+public class Clustering {
+
+  /** Number of initial clusters written by {@link #initClusters()}. */
+  static final int K = 500;
+  /** Passed to {@code KMeansDriver.run}; presumably the maximum number of iterations. */
+  static final int MAX_ITE = 500;
+  // TODO: make parameterization
+  /** Directory that is scanned for the {@code *.txt} descriptor files. */
+  static final String SRC_DIR = "/Users/koji/Project/rondhuit/JAIST/SUB-THEME/out";
+  /** Directory that receives the input points. */
+  static final String D_IN_POINTS = "testdata/points";
+  /** Sequence file written by {@link #prepareInputVectors()}. */
+  static final String F_IN_INPUT_VECTORS = D_IN_POINTS + "/input-vectors";
+  /** Directory that receives the initial clusters. */
+  static final String D_IN_CLUSTERS = "testdata/clusters";
+  /** Sequence file written by {@link #initClusters()}. */
+  static final String F_IN_INIT_CLUSTERS = D_IN_CLUSTERS + "/part-00000";
+  /** Output directory handed to the k-means driver. */
+  static final String D_OUT = "output";
+  /** The first {@link #K} vectors read from the source files; they seed the initial clusters. */
+  static List<NamedVector> initVectors = new ArrayList<NamedVector>();
+
+  static Configuration conf;
+  static FileSystem fs;
+
+  /**
+   * Runs the three steps (input vectors, initial clusters, k-means) and
+   * prints the elapsed wall-clock time in seconds.
+   *
+   * @param args ignored
+   * @throws Exception if any of the steps fails
+   */
+  public static void main(String[] args) throws Exception {
+    long startTime = System.currentTimeMillis();
+
+    conf = new Configuration();
+    fs = FileSystem.get(conf);
+
+    prepareInputVectors();
+    initClusters();
+    computeClustering();
+
+    int elapseSec = (int) ((System.currentTimeMillis() - startTime) / 1000);
+    System.out.printf("elapse = %d sec\n", elapseSec);
+  }
+
+  /**
+   * Writes every vector of every {@code *.txt} file in {@link #SRC_DIR} to
+   * {@link #F_IN_INPUT_VECTORS}, keyed by a running record number.
+   *
+   * @throws IOException if the source directory cannot be listed or reading/writing fails
+   */
+  static void prepareInputVectors() throws IOException {
+    new File(D_IN_POINTS).mkdirs();
+
+    File[] txtFiles = new File(SRC_DIR).listFiles(new FileFilter() {
+      public boolean accept(File pathname) {
+        return pathname.getName().endsWith(".txt");
+      }
+    });
+    if (txtFiles == null) {
+      // listFiles() returns null when the path is not a directory or cannot be read
+      throw new IOException("cannot list source directory: " + SRC_DIR);
+    }
+
+    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf,
+        new Path(F_IN_INPUT_VECTORS), LongWritable.class, VectorWritable.class);
+    try {
+      VectorWritable vec = new VectorWritable();
+      long recNum = 0;
+      for (File txtFile : txtFiles) {
+        for (Vector vector : getNamedVectorsFromFile(txtFile)) {
+          vec.set(vector);
+          writer.append(new LongWritable(recNum++), vec);
+        }
+      }
+    } finally {
+      // close the writer even when reading one of the source files fails
+      writer.close();
+    }
+  }
+
+  /**
+   * Reads one descriptor file into named vectors. The first two lines are
+   * skipped; every following non-blank line becomes a vector named
+   * {@code <file name>_<running number>}.
+   *
+   * <p>Side effect: until {@link #initVectors} holds {@link #K} entries, each
+   * vector that is read is also added to it.
+   *
+   * @param txtFile the descriptor file to read
+   * @return the vectors in file order
+   * @throws IOException if the file cannot be read
+   * @throws NumberFormatException if a value is not a valid double
+   */
+  static List<NamedVector> getNamedVectorsFromFile(File txtFile) throws IOException {
+    List<NamedVector> nvList = new ArrayList<NamedVector>();
+    BufferedReader br = new BufferedReader(new FileReader(txtFile));
+    try {
+      br.readLine(); // skip file name
+      br.readLine(); // skip number of lines count
+
+      int num = 0;
+      String line;
+      while ((line = br.readLine()) != null) {
+        String trimmed = line.trim();
+        if (trimmed.length() == 0) {
+          // a blank (e.g. trailing) line would otherwise fail in parseDouble("")
+          continue;
+        }
+
+        String[] strValues = trimmed.split("\\s+");
+        double[] values = new double[strValues.length];
+        for (int i = 0; i < strValues.length; i++) {
+          values[i] = Double.parseDouble(strValues[i]);
+        }
+
+        String name = txtFile.getName() + '_' + num;
+        NamedVector nv = new NamedVector(new RandomAccessSparseVector(values.length), name);
+        nv.assign(values);
+        nvList.add(nv);
+
+        // if preparation for initial clusters is needed then do it!
+        if (initVectors.size() < K) {
+          initVectors.add(nv);
+        }
+
+        num++;
+      }
+    } finally {
+      br.close();
+    }
+
+    return nvList;
+  }
+
+  /**
+   * Writes {@link #K} initial clusters, one per entry of {@link #initVectors},
+   * to {@link #F_IN_INIT_CLUSTERS}.
+   *
+   * @throws IllegalStateException if fewer than {@link #K} vectors have been collected
+   * @throws IOException if the cluster file cannot be written
+   */
+  static void initClusters() throws IOException {
+    if (initVectors.size() < K) {
+      // fail before the file is created instead of leaving a partial cluster file behind
+      throw new IllegalStateException("need at least " + K
+          + " input vectors to seed the clusters but only " + initVectors.size()
+          + " were read from " + SRC_DIR);
+    }
+
+    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf,
+        new Path(F_IN_INIT_CLUSTERS), Text.class, Kluster.class);
+    try {
+      for (int i = 0; i < K; i++) {
+        Vector vec = initVectors.get(i);
+        Kluster cluster = new Kluster(vec, i, new EuclideanDistanceMeasure());
+        writer.append(new Text(cluster.getIdentifier()), cluster);
+      }
+    } finally {
+      writer.close();
+    }
+  }
+
+  /**
+   * Runs k-means on {@link #D_IN_POINTS} with the initial clusters in
+   * {@link #D_IN_CLUSTERS}, writing to {@link #D_OUT}.
+   *
+   * @throws IOException if thrown by the driver
+   * @throws InterruptedException if thrown by the driver
+   * @throws ClassNotFoundException if thrown by the driver
+   */
+  static void computeClustering() throws IOException, InterruptedException,
+      ClassNotFoundException {
+    // NOTE(review): the trailing arguments are presumably convergence delta,
+    // max iterations, runClustering, classification threshold and
+    // runSequential -- confirm against the KMeansDriver version in use.
+    KMeansDriver.run(conf, new Path(D_IN_POINTS), new Path(D_IN_CLUSTERS),
+        new Path(D_OUT), new EuclideanDistanceMeasure(), 0.001, MAX_ITE,
+        true, 0.1, false);
+  }
+}
Added: labs/alike/trunk/src/main/java/org/apache/alike/HistogramMatching.java
URL:
http://svn.apache.org/viewvc/labs/alike/trunk/src/main/java/org/apache/alike/HistogramMatching.java?rev=1419182&view=auto
==============================================================================
--- labs/alike/trunk/src/main/java/org/apache/alike/HistogramMatching.java
(added)
+++ labs/alike/trunk/src/main/java/org/apache/alike/HistogramMatching.java Mon
Dec 10 00:28:01 2012
@@ -0,0 +1,282 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.alike;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+
+
+/**
+ * Draft tool that assigns every descriptor of every image to its nearest
+ * centroid, builds one histogram of centroid votes per image, writes the
+ * histograms as Solr add-XML and prints a small similarity ranking.
+ */
+public class HistogramMatching {
+
+  /** Number of centroids, i.e. number of histogram bins. */
+  static final int K = 500;
+  /** Number of vector components that are read and compared. */
+  static final int D = 64;
+  // TODO: make parameterization
+  /** Directory that is scanned for the {@code *.txt} descriptor files. */
+  static final String SRC_DIR = "/Users/koji/Project/rondhuit/JAIST/SUB-THEME/out";
+  /** Text file the centroids are read from. */
+  static final String F_IN_CENTROIDS = "centroids.txt";
+
+  // Not used by this class; kept so that the declarations visible to the
+  // package stay unchanged.
+  static Configuration conf;
+  static FileSystem fs;
+
+  /** Marker that precedes the centroid values on a line of the centroid file. */
+  private static final String CENTROID_START = "c=[";
+  /** Marker that follows the centroid values on a line of the centroid file. */
+  private static final String CENTROID_END = "] r=[";
+  /** Number of best matches printed by {@link #test(Map)}. */
+  private static final int TOP_N = 10;
+  /** Number of bins drawn by {@link #printHistograms(Map)}. */
+  private static final int PRINTED_BINS = 100;
+
+  /**
+   * Reads the centroids, builds the histograms, writes {@code demo-data.xml}
+   * and prints the similarity ranking.
+   *
+   * @param args ignored
+   * @throws IOException if reading the input or writing the output fails
+   */
+  public static void main(String[] args) throws IOException {
+    double[][] centroids = getCentroids();
+    Map<String, int[]> histograms = makeHistograms(centroids);
+    //printHistograms(histograms);
+    printForSolr(histograms);
+    test(histograms);
+  }
+
+  /**
+   * Reads up to {@link #K} centroids from {@link #F_IN_CENTROIDS}. Only lines
+   * that contain {@code c=[ ... ] r=[} are used; all other lines are skipped.
+   * Rows for which no line is found stay all zero.
+   *
+   * @return a {@code K x D} matrix, one centroid per row
+   * @throws IOException if the file cannot be read or holds more than K centroids
+   */
+  static double[][] getCentroids() throws IOException {
+    double[][] centroids = new double[K][D];
+
+    BufferedReader br = new BufferedReader(new FileReader(F_IN_CENTROIDS));
+    try {
+      String line;
+      int i = 0;
+      while ((line = br.readLine()) != null) {
+        int start = line.indexOf(CENTROID_START);
+        if (start < 0) {
+          continue; // not a centroid line
+        }
+        int sp = start + CENTROID_START.length();
+        int ep = line.indexOf(CENTROID_END, sp);
+        if (ep < 0) {
+          continue; // not a centroid line
+        }
+        if (i >= K) {
+          throw new IOException(F_IN_CENTROIDS + " contains more than " + K + " centroids");
+        }
+        parseCentroid(line.substring(sp, ep), centroids[i]);
+        i++;
+      }
+    } finally {
+      br.close();
+    }
+
+    return centroids;
+  }
+
+  /**
+   * Fills one centroid row from a comma-separated value list. A token of the
+   * form {@code index:value} (sparse notation) is stored at that index, any
+   * other token at its position in the list. Components that do not fit into
+   * the row are ignored.
+   */
+  private static void parseCentroid(String body, double[] centroid) {
+    String[] tokens = body.trim().split(",\\s*");
+    int position = 0;
+    for (String token : tokens) {
+      String t = token.trim();
+      if (t.length() == 0) {
+        continue; // empty list or stray separator
+      }
+      int col = t.indexOf(':');
+      if (col >= 0) {
+        int j = Integer.parseInt(t.substring(0, col).trim());
+        if (j >= 0 && j < centroid.length) {
+          centroid[j] = Double.parseDouble(t.substring(col + 1));
+        }
+      } else if (position < centroid.length) {
+        centroid[position] = Double.parseDouble(t);
+      }
+      position++;
+    }
+  }
+
+  /**
+   * Builds one histogram per {@code *.txt} file found in {@link #SRC_DIR}.
+   *
+   * @param centroids the centroids to vote for
+   * @return map from file name to histogram
+   * @throws IOException if the directory cannot be listed or a file cannot be read
+   */
+  static Map<String, int[]> makeHistograms(double[][] centroids) throws IOException {
+    Map<String, int[]> histogramMap = new HashMap<String, int[]>();
+
+    File[] txtFiles = new File(SRC_DIR).listFiles(new FileFilter() {
+      public boolean accept(File pathname) {
+        return pathname.getName().endsWith(".txt");
+      }
+    });
+    if (txtFiles == null) {
+      // listFiles() returns null when the path is not a directory or cannot be read
+      throw new IOException("cannot list source directory: " + SRC_DIR);
+    }
+
+    for (File txtFile : txtFiles) {
+      histogramMap.put(txtFile.getName(), makeHistgram(centroids, txtFile));
+    }
+
+    return histogramMap;
+  }
+
+  /**
+   * Builds the histogram of one descriptor file: the first two lines are
+   * skipped, every following non-blank line is parsed as a descriptor and
+   * votes for its nearest centroid.
+   *
+   * @param centroids the centroids to vote for
+   * @param txtFile the descriptor file
+   * @return histogram with {@link #K} bins
+   * @throws IOException if the file cannot be read
+   */
+  static int[] makeHistgram(double[][] centroids, File txtFile) throws IOException {
+    int[] histgram = new int[K];
+
+    BufferedReader br = new BufferedReader(new FileReader(txtFile));
+    try {
+      br.readLine(); // skip file name
+      br.readLine(); // skip number of lines count
+
+      String line;
+      while ((line = br.readLine()) != null) {
+        String trimmed = line.trim();
+        if (trimmed.length() == 0) {
+          // a blank (e.g. trailing) line would otherwise fail in parseDouble("")
+          continue;
+        }
+        String[] strValues = trimmed.split("\\s+");
+        double[] desc = new double[strValues.length];
+        for (int i = 0; i < strValues.length; i++) {
+          desc[i] = Double.parseDouble(strValues[i]);
+        }
+
+        voteForVisualWord(histgram, centroids, desc);
+      }
+    } finally {
+      br.close();
+    }
+
+    return histgram;
+  }
+
+  /**
+   * Increments the histogram bin of the centroid that is nearest to the
+   * descriptor. At most {@link #K} centroids are considered, and never more
+   * than there are centroids or bins.
+   *
+   * @param histgram the histogram to update
+   * @param centroids candidate centroids, one per row
+   * @param desc the descriptor to assign
+   * @throws IllegalArgumentException if no centroid can be selected (no
+   *         candidates, or no distance below {@code Double.MAX_VALUE})
+   */
+  static void voteForVisualWord(int[] histgram, double[][] centroids, double[] desc) {
+    int candidates = Math.min(K, Math.min(centroids.length, histgram.length));
+    double minDistance = Double.MAX_VALUE;
+    int pos = -1;
+    for (int i = 0; i < candidates; i++) {
+      double distance = computeSimilarity(centroids[i], desc);
+      if (minDistance > distance) {
+        minDistance = distance;
+        pos = i;
+      }
+    }
+    if (pos < 0) {
+      throw new IllegalArgumentException(
+          "no nearest centroid found for descriptor (candidates=" + candidates + ")");
+    }
+
+    // vote for minimum distance
+    histgram[pos]++;
+  }
+
+  /**
+   * Returns the Euclidean distance between the two vectors over their first
+   * {@link #D} components (fewer if either vector is shorter). Despite the
+   * method name, a smaller value means a closer match.
+   *
+   * @param centroid first vector
+   * @param desc second vector
+   * @return the Euclidean distance
+   */
+  static double computeSimilarity(double[] centroid, double[] desc) {
+    int dims = Math.min(D, Math.min(centroid.length, desc.length));
+    double sum = 0;
+    for (int i = 0; i < dims; i++) {
+      double diff = centroid[i] - desc[i];
+      sum += diff * diff;
+    }
+    return Math.sqrt(sum);
+  }
+
+  /**
+   * Prints the first bins of every histogram to stdout as rows of {@code *},
+   * one row per bin.
+   *
+   * @param histograms map from file name to histogram
+   */
+  static void printHistograms(Map<String, int[]> histograms) {
+    for (Map.Entry<String, int[]> entry : histograms.entrySet()) {
+      System.out.println("\n------------------------------------------------------------");
+      System.out.println(entry.getKey());
+      int[] histogram = entry.getValue();
+      int bins = Math.min(PRINTED_BINS, histogram.length);
+      for (int i = 0; i < bins; i++) {
+        int v = histogram[i];
+        for (int j = 0; j < v; j++) {
+          System.out.print("*");
+        }
+        System.out.println(); // for LF
+      }
+    }
+  }
+
+  /**
+   * Writes all histograms to {@code demo-data.xml} (UTF-8) as one
+   * {@code <add>} element with one {@code <doc>} per histogram.
+   *
+   * @param histograms map from file name to histogram
+   * @throws IOException if the file cannot be created or written
+   */
+  static void printForSolr(Map<String, int[]> histograms) throws IOException {
+    PrintWriter pw = new PrintWriter("demo-data.xml", "UTF-8");
+    try {
+      pw.println("<add>");
+      for (Map.Entry<String, int[]> entry : histograms.entrySet()) {
+        pw.println("<doc>");
+        printImageFileNameField(pw, entry.getKey());
+        printAssembledQueryField(pw, entry.getValue());
+        printHistogramField(pw, entry.getValue());
+        pw.println("</doc>");
+      }
+      pw.println("</add>");
+      // PrintWriter never throws on write errors; it has to be asked
+      if (pw.checkError()) {
+        throw new IOException("failed to write demo-data.xml");
+      }
+    } finally {
+      pw.close();
+    }
+  }
+
+  /**
+   * Writes the {@code imgFile} field. The key is split at its first
+   * {@code '-'} into directory and file name and {@code .txt} is replaced by
+   * {@code .jpg}; a key without {@code '-'} is placed directly under
+   * {@code images/}.
+   */
+  private static void printImageFileNameField(PrintWriter pw, String key) throws IOException {
+    int idx = key.indexOf('-');
+    String value;
+    if (idx < 0) {
+      value = "images/" + key.replace(".txt", ".jpg");
+    } else {
+      value = "images/" + key.substring(0, idx) + "/"
+          + key.substring(idx + 1).replace(".txt", ".jpg");
+    }
+    printField(pw, "imgFile", value);
+  }
+
+  /**
+   * Writes the {@code query} field: one {@code bin^count} term per non-empty
+   * bin, separated by blanks.
+   */
+  private static void printAssembledQueryField(PrintWriter pw, int[] histogram)
+      throws IOException {
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < histogram.length; i++) {
+      if (histogram[i] > 0) {
+        sb.append(i).append('^').append(histogram[i]).append(' ');
+      }
+    }
+
+    printField(pw, "query", sb.toString().trim());
+  }
+
+  /**
+   * Writes the {@code histogram} field: every bin index repeated as many
+   * times as the bin has votes, separated by blanks.
+   */
+  private static void printHistogramField(PrintWriter pw, int[] histogram) throws IOException {
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < histogram.length; i++) {
+      int v = histogram[i];
+      for (int j = 0; j < v; j++) {
+        sb.append(i).append(' ');
+      }
+    }
+
+    printField(pw, "histogram", sb.toString().trim());
+  }
+
+  /**
+   * Writes one {@code <field>} element. Name and value are XML-escaped so
+   * that characters such as {@code &} or {@code <} cannot break the document.
+   *
+   * @param pw the writer to print to
+   * @param name the field name
+   * @param value the field value
+   * @throws IOException declared for callers; not thrown by this method itself
+   */
+  static void printField(PrintWriter pw, String name, String value) throws IOException {
+    pw.printf(" <field name=\"%s\">%s</field>\n", escapeXml(name), escapeXml(value));
+  }
+
+  /** Replaces the five XML special characters by their predefined entities. */
+  private static String escapeXml(String text) {
+    StringBuilder sb = new StringBuilder(text.length() + 16);
+    for (int i = 0; i < text.length(); i++) {
+      char ch = text.charAt(i);
+      switch (ch) {
+        case '&':
+          sb.append("&amp;");
+          break;
+        case '<':
+          sb.append("&lt;");
+          break;
+        case '>':
+          sb.append("&gt;");
+          break;
+        case '"':
+          sb.append("&quot;");
+          break;
+        case '\'':
+          sb.append("&apos;");
+          break;
+        default:
+          sb.append(ch);
+      }
+    }
+    return sb.toString();
+  }
+
+  /**
+   * For every histogram whose key ends with {@code 0010.txt}, prints the keys
+   * and scores of the (at most) ten histograms with the highest cosine
+   * similarity to it.
+   *
+   * @param histograms map from file name to histogram
+   */
+  static void test(Map<String, int[]> histograms) {
+    Comparator<KeyScorePair> c = new KeyScorePairComparator();
+    for (Map.Entry<String, int[]> src : histograms.entrySet()) {
+      String key = src.getKey();
+      if (!key.endsWith("0010.txt")) {
+        continue;
+      }
+
+      int[] srcHisto = src.getValue();
+      List<KeyScorePair> list = new ArrayList<KeyScorePair>(histograms.size());
+      for (Map.Entry<String, int[]> entry : histograms.entrySet()) {
+        // intersection(srcHisto, entry.getValue()) is the alternative measure
+        list.add(new KeyScorePair(entry.getKey(), cosine(srcHisto, entry.getValue())));
+      }
+      Collections.sort(list, c);
+
+      System.out.printf("\n%s\n", key);
+      int top = Math.min(TOP_N, list.size()); // there may be fewer than ten histograms
+      for (int i = 0; i < top; i++) {
+        KeyScorePair ksp = list.get(i);
+        System.out.printf("\t%s, %f\n", ksp.key, ksp.score);
+      }
+    }
+  }
+
+  /**
+   * Histogram intersection: the sum of the bin-wise minima. Bins that exist
+   * in only one of the arrays contribute nothing.
+   *
+   * @param v1 first histogram
+   * @param v2 second histogram
+   * @return the intersection value
+   */
+  public static double intersection(int[] v1, int[] v2) {
+    int bins = Math.min(v1.length, v2.length);
+    long sum = 0;
+    for (int i = 0; i < bins; i++) {
+      sum += Math.min(v1[i], v2[i]);
+    }
+
+    return (double) sum;
+  }
+
+  /**
+   * Cosine similarity of the two histograms. Bins that exist in only one of
+   * the arrays are treated as zero in the other.
+   *
+   * @param v1 first histogram
+   * @param v2 second histogram
+   * @return the cosine, or 0 if the dot product or one of the norms is zero
+   */
+  public static double cosine(int[] v1, int[] v2) {
+    int bins = Math.min(v1.length, v2.length);
+    long numerator = 0;
+    for (int i = 0; i < bins; i++) {
+      // widen before multiplying, otherwise the product overflows int
+      numerator += (long) v1[i] * v2[i];
+    }
+    if (numerator == 0) {
+      return 0;
+    }
+    double denominator = getSumSquareRoot(v1) * getSumSquareRoot(v2);
+
+    // should not happen, but avoid a division by zero
+    if (denominator == 0.0) {
+      return 0;
+    }
+
+    return numerator / denominator;
+  }
+
+  /**
+   * Returns the Euclidean norm of the vector, i.e. the square root of the sum
+   * of its squared components.
+   *
+   * @param v the vector
+   * @return the norm
+   */
+  public static double getSumSquareRoot(int[] v) {
+    double sum = 0;
+    for (int i = 0; i < v.length; i++) {
+      // widen before multiplying, otherwise the square overflows int
+      sum += (double) v[i] * v[i];
+    }
+
+    return Math.sqrt(sum);
+  }
+
+  /** A histogram key together with its similarity score. */
+  static class KeyScorePair {
+    String key;
+    double score;
+
+    public KeyScorePair(String key, double score) {
+      this.key = key;
+      this.score = score;
+    }
+  }
+
+  /** Orders pairs by descending score. */
+  static class KeyScorePairComparator implements Comparator<KeyScorePair> {
+    public int compare(KeyScorePair arg0, KeyScorePair arg1) {
+      // Double.compare returns 0 for equal scores, which the Comparator
+      // contract (and Collections.sort) requires
+      return Double.compare(arg1.score, arg0.score);
+    }
+  }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]