HistogramMatching.java

koji Sun, 09 Dec 2012 16:28:26 -0800

Author: koji
Date: Mon Dec 10 00:28:01 2012
New Revision: 1419182

URL: http://svn.apache.org/viewvc?rev=1419182&view=rev
Log:
add draft version of Java programs


Added:
    labs/alike/trunk/src/main/java/org/
    labs/alike/trunk/src/main/java/org/apache/
    labs/alike/trunk/src/main/java/org/apache/alike/
    labs/alike/trunk/src/main/java/org/apache/alike/Clustering.java
    labs/alike/trunk/src/main/java/org/apache/alike/HistogramMatching.java

Added: labs/alike/trunk/src/main/java/org/apache/alike/Clustering.java
URL: 
http://svn.apache.org/viewvc/labs/alike/trunk/src/main/java/org/apache/alike/Clustering.java?rev=1419182&view=auto
==============================================================================
--- labs/alike/trunk/src/main/java/org/apache/alike/Clustering.java (added)
+++ labs/alike/trunk/src/main/java/org/apache/alike/Clustering.java Mon Dec 10 
00:28:01 2012
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.alike;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.kmeans.Kluster;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+public class Clustering {
+
+  static final int K = 500; // number of initial clusters; initClusters() writes exactly K of them
+  static final int MAX_ITE = 500; // handed to KMeansDriver.run(); presumably the iteration limit -- confirm
+  // TODO: make these settings configurable instead of hard-coding them
+  static final String SRC_DIR = 
"/Users/koji/Project/rondhuit/JAIST/SUB-THEME/out"; // directory scanned for *.txt input files
+  static final String D_IN_POINTS = "testdata/points"; // directory holding the input points
+  static final String F_IN_INPUT_VECTORS = D_IN_POINTS + "/input-vectors"; // sequence file written by prepareInputVectors()
+  static final String D_IN_CLUSTERS = "testdata/clusters"; // directory holding the initial clusters
+  static final String F_IN_INIT_CLUSTERS = D_IN_CLUSTERS + "/part-00000"; // sequence file written by initClusters()
+  static final String D_OUT = "output"; // output path handed to KMeansDriver.run()
+  static List<NamedVector> initVectors = new ArrayList<NamedVector>(); // first K vectors read; used as initial cluster centers
+
+  static Configuration conf; // assigned in main()
+  static FileSystem fs; // assigned in main(), obtained from conf
+  
+  /**
+   * Entry point: prepares the input vectors, writes the initial clusters,
+   * runs the clustering job and finally prints the elapsed wall-clock time.
+   *
+   * @param args command line arguments (not used)
+   * @throws Exception if any of the steps fails
+   */
+  public static void main(String[] args) throws Exception {
+    final long begin = System.currentTimeMillis();
+
+    // shared Hadoop handles used by the steps below
+    conf = new Configuration();
+    fs = FileSystem.get(conf);
+
+    prepareInputVectors();
+    initClusters();
+    computeClustering();
+
+    final long elapsedMillis = System.currentTimeMillis() - begin;
+    final int seconds = (int) (elapsedMillis / 1000);
+    System.out.printf("elapse = %d sec\n", seconds);
+  }
+  
+  /**
+   * Writes every vector found in the *.txt files under SRC_DIR to the
+   * sequence file F_IN_INPUT_VECTORS, keyed by a running record number.
+   *
+   * @throws IOException if SRC_DIR cannot be listed or reading/writing fails
+   */
+  static void prepareInputVectors() throws IOException {
+    new File(D_IN_POINTS).mkdirs();
+
+    File srcDirFile = new File(SRC_DIR);
+    File[] txtFiles = srcDirFile.listFiles(new FileFilter() {
+      public boolean accept(File pathname) {
+        return pathname.getName().endsWith(".txt");
+      }
+    });
+    // listFiles() returns null when the path is not a readable directory
+    if (txtFiles == null) {
+      throw new IOException("cannot list source directory: " + SRC_DIR);
+    }
+
+    Path path = new Path(F_IN_INPUT_VECTORS);
+    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf,
+        path, LongWritable.class, VectorWritable.class);
+    try {
+      VectorWritable vec = new VectorWritable();
+      long recNum = 0;
+      for (File txtFile : txtFiles) {
+        for (Vector vector : getNamedVectorsFromFile(txtFile)) {
+          vec.set(vector);
+          writer.append(new LongWritable(recNum++), vec);
+        }
+      }
+    } finally {
+      // close the writer even when reading or appending fails
+      writer.close();
+    }
+  }
+  
+  /**
+   * Reads one text file and turns every data line into a named vector.
+   * The first two lines (file name and line count) are skipped; each
+   * remaining non-blank line must hold whitespace separated numbers.
+   * Vectors are named "&lt;file name&gt;_&lt;running number&gt;". As a side effect
+   * the vectors are also added to initVectors until it holds K entries.
+   *
+   * @param txtFile the file to read
+   * @return the vectors of the file, in file order
+   * @throws IOException if the file cannot be read
+   */
+  static List<NamedVector> getNamedVectorsFromFile(File txtFile) throws IOException {
+    List<NamedVector> nvList = new ArrayList<NamedVector>();
+    BufferedReader br = new BufferedReader(new FileReader(txtFile));
+    try {
+      br.readLine();  // skip file name
+      br.readLine();  // skip number of lines count
+      String line;
+      int num = 0;
+      while ((line = br.readLine()) != null) {
+        String trimmed = line.trim();
+        if (trimmed.length() == 0) {
+          // a blank line (e.g. at the end of the file) carries no vector
+          continue;
+        }
+        String[] strValues = trimmed.split("\\s+");
+        double[] values = new double[strValues.length];
+        for (int i = 0; i < strValues.length; i++) {
+          values[i] = Double.parseDouble(strValues[i]);
+        }
+        String name = txtFile.getName() + '_' + num;
+        NamedVector nv = new NamedVector(new RandomAccessSparseVector(values.length), name);
+        nv.assign(values);
+        nvList.add(nv);
+
+        // collect the first K vectors as initial cluster centers
+        if (initVectors.size() < K) {
+          initVectors.add(nv);
+        }
+
+        num++;
+      }
+    } finally {
+      // release the file handle even when parsing fails
+      br.close();
+    }
+
+    return nvList;
+  }
+  
+  /**
+   * Writes the first K collected vectors as initial clusters to
+   * F_IN_INIT_CLUSTERS, each one keyed by its cluster identifier.
+   *
+   * @throws IOException if the sequence file cannot be written
+   * @throws IllegalStateException if fewer than K vectors were collected
+   */
+  static void initClusters() throws IOException {
+    // fail early with a clear message instead of an IndexOutOfBoundsException
+    // in the middle of writing the cluster file
+    if (initVectors.size() < K) {
+      throw new IllegalStateException("need " + K + " initial vectors but only "
+          + initVectors.size() + " were collected");
+    }
+
+    Path path = new Path(F_IN_INIT_CLUSTERS);
+    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf,
+        path, Text.class, Kluster.class);
+    try {
+      for (int i = 0; i < K; i++) {
+        Vector vec = initVectors.get(i);
+        Kluster cluster = new Kluster(vec, i, new EuclideanDistanceMeasure());
+        writer.append(new Text(cluster.getIdentifier()), cluster);
+      }
+    } finally {
+      // close the writer even when appending fails
+      writer.close();
+    }
+  }
+
+  /**
+   * Hands the prepared points and initial clusters to KMeansDriver.run()
+   * together with a Euclidean distance measure and MAX_ITE. The remaining
+   * numeric and boolean arguments are fixed literals.
+   * NOTE(review): 0.001 is presumably the convergence delta -- confirm
+   * against the Mahout version in use.
+   */
+  static void computeClustering()
+      throws IOException, InterruptedException, ClassNotFoundException {
+    Path points = new Path(D_IN_POINTS);
+    Path initialClusters = new Path(D_IN_CLUSTERS);
+    Path output = new Path(D_OUT);
+    EuclideanDistanceMeasure measure = new EuclideanDistanceMeasure();
+
+    KMeansDriver.run(conf, points, initialClusters, output, measure,
+        0.001, MAX_ITE, true, 0.1, false);
+  }
+}

Added: labs/alike/trunk/src/main/java/org/apache/alike/HistogramMatching.java
URL: 
http://svn.apache.org/viewvc/labs/alike/trunk/src/main/java/org/apache/alike/HistogramMatching.java?rev=1419182&view=auto
==============================================================================
--- labs/alike/trunk/src/main/java/org/apache/alike/HistogramMatching.java 
(added)
+++ labs/alike/trunk/src/main/java/org/apache/alike/HistogramMatching.java Mon 
Dec 10 00:28:01 2012
@@ -0,0 +1,282 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.alike;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+
+
+public class HistogramMatching {
+
+  static final int K = 500; // number of centroid rows read by getCentroids(); also the histogram length
+  static final int D = 64; // number of components per centroid and per compared descriptor
+  // TODO: make these settings configurable instead of hard-coding them
+  static final String SRC_DIR = 
"/Users/koji/Project/rondhuit/JAIST/SUB-THEME/out"; // directory scanned for *.txt descriptor files
+  //static final String D_IN_CLUSTERS = "testdata/clusters";
+  //static final String F_IN_INIT_CLUSTERS = D_IN_CLUSTERS + "/part-00000";
+  //static final String D_OUT = "output";
+  static final String F_IN_CENTROIDS = "centroids.txt"; // centroid file read by getCentroids()
+
+  static Configuration conf; // not referenced by any method of this class
+  static FileSystem fs; // not referenced by any method of this class
+
+  /**
+   * Entry point: loads the centroids, builds one histogram per descriptor
+   * file, writes the Solr input file and prints a small matching test.
+   *
+   * @param args command line arguments (not used)
+   * @throws IOException if reading the input or writing the output fails
+   */
+  public static void main(String[] args) throws IOException {
+    Map<String, int[]> histogramsByFile = makeHistograms(getCentroids());
+
+    // printHistograms(histogramsByFile);  // debugging aid, disabled
+    printForSolr(histogramsByFile);
+    test(histogramsByFile);
+  }
+  
+  /**
+   * Reads up to K centroids from F_IN_CENTROIDS. A centroid line carries its
+   * values between the markers "c=[" and "] r=["; lines without both
+   * markers are ignored. Fewer than D comma separated entries are read as
+   * sparse "index:value" pairs, otherwise the first D entries are taken as
+   * a dense vector.
+   *
+   * @return a K x D array; rows beyond the number of centroids read stay zero
+   * @throws IOException if the file cannot be read or holds more than K centroids
+   */
+  static double[][] getCentroids() throws IOException {
+    double[][] centroids = new double[K][D];
+
+    BufferedReader br = new BufferedReader(new FileReader(F_IN_CENTROIDS));
+    try {
+      String line;
+      int i = 0;
+      while ((line = br.readLine()) != null) {
+        int open = line.indexOf("c=[");
+        int ep = (open < 0) ? -1 : line.indexOf("] r=[", open);
+        if (ep < 0) {
+          // not a centroid line (blank line, header, ...): nothing to parse
+          continue;
+        }
+        if (i >= K) {
+          throw new IOException(F_IN_CENTROIDS + " contains more than " + K + " centroids");
+        }
+        int sp = open + "c=[".length();
+        String[] strValues = line.substring(sp, ep).trim().split(",\\s*");
+        if (strValues.length < D) {
+          // fewer than D entries: treat them as sparse "index:value" pairs
+          for (String sv : strValues) {
+            int col = sv.indexOf(':');
+            int j = Integer.parseInt(sv.substring(0, col));
+            centroids[i][j] = Double.parseDouble(sv.substring(col + 1));
+          }
+        } else {
+          for (int j = 0; j < D; j++) {
+            centroids[i][j] = Double.parseDouble(strValues[j]);
+          }
+        }
+        i++;
+      }
+    } finally {
+      // release the file handle even when parsing fails
+      br.close();
+    }
+
+    return centroids;
+  }
+  
+  /**
+   * Builds one histogram for every *.txt file found directly under SRC_DIR.
+   *
+   * @param centroids the centroids the descriptors are matched against
+   * @return the histograms, keyed by file name
+   * @throws IOException if SRC_DIR cannot be listed or a file cannot be read
+   */
+  static Map<String, int[]> makeHistograms(double[][] centroids) throws IOException {
+    File srcDirFile = new File(SRC_DIR);
+    File[] txtFiles = srcDirFile.listFiles(new FileFilter() {
+      public boolean accept(File pathname) {
+        return pathname.getName().endsWith(".txt");
+      }
+    });
+    // listFiles() returns null when the path is not a readable directory
+    if (txtFiles == null) {
+      throw new IOException("cannot list source directory: " + SRC_DIR);
+    }
+
+    Map<String, int[]> histogramMap = new HashMap<String, int[]>();
+    for (File txtFile : txtFiles) {
+      histogramMap.put(txtFile.getName(), makeHistgram(centroids, txtFile));
+    }
+
+    return histogramMap;
+  }
+  
+  /**
+   * Builds the histogram of one descriptor file. The first two lines (file
+   * name and line count) are skipped; every remaining non-blank line is
+   * parsed as one descriptor and casts one vote via voteForVisualWord().
+   *
+   * @param centroids the centroids the descriptors are matched against
+   * @param txtFile the descriptor file to read
+   * @return an array of K vote counts
+   * @throws IOException if the file cannot be read
+   */
+  static int[] makeHistgram(double[][] centroids, File txtFile) throws IOException {
+    int[] histgram = new int[K];
+
+    BufferedReader br = new BufferedReader(new FileReader(txtFile));
+    try {
+      br.readLine();  // skip file name
+      br.readLine();  // skip number of lines count
+      String line;
+      while ((line = br.readLine()) != null) {
+        String trimmed = line.trim();
+        if (trimmed.length() == 0) {
+          // a blank line (e.g. at the end of the file) carries no descriptor
+          continue;
+        }
+        String[] strValues = trimmed.split("\\s+");
+        double[] desc = new double[strValues.length];
+        for (int i = 0; i < strValues.length; i++) {
+          desc[i] = Double.parseDouble(strValues[i]);
+        }
+
+        voteForVisualWord(histgram, centroids, desc);
+      }
+    } finally {
+      // release the file handle even when parsing fails
+      br.close();
+    }
+
+    return histgram;
+  }
+
+  /**
+   * Increments the histogram bin of the centroid with the smallest distance
+   * to the descriptor. On ties the first such centroid wins.
+   *
+   * @param histgram the vote counts to update
+   * @param centroids the first K rows are compared with the descriptor
+   * @param desc the descriptor to assign
+   */
+  static void voteForVisualWord(int[] histgram, double[][] centroids, double[] desc) {
+    double minDistance = Double.MAX_VALUE;
+    int pos = -1;  // -1 = no centroid selected yet
+    for (int i = 0; i < K; i++) {
+      double distance = computeSimilarity(centroids[i], desc);
+      if (minDistance > distance) {
+        minDistance = distance;
+        pos = i;
+      }
+    }
+
+    // A NaN or infinite distance never compares as smaller, so possibly no
+    // centroid was selected; skip the vote instead of indexing out of bounds.
+    if (pos >= 0) {
+      histgram[pos]++;
+    }
+  }
+  
+  /**
+   * Returns the Euclidean distance between the first D components of the
+   * two arrays. Despite the method name, smaller values mean closer vectors.
+   *
+   * @param centroid the centroid, at least D components long
+   * @param desc the descriptor, at least D components long
+   * @return the square root of the summed squared differences
+   */
+  static double computeSimilarity(double[] centroid, double[] desc) {
+    double squaredSum = 0;
+    int i = 0;
+    while (i < D) {
+      double diff = centroid[i] - desc[i];
+      squaredSum += diff * diff;
+      i++;
+    }
+    return Math.sqrt(squaredSum);
+  }
+  
+  /**
+   * Prints every histogram to standard output as a bar chart: a separator
+   * line, the key, then one line of '*' characters per bin. At most the
+   * first 100 bins of each histogram are shown.
+   *
+   * @param histograms the histograms to print, keyed by file name
+   */
+  static void printHistograms(Map<String, int[]> histograms) {
+    for (Map.Entry<String, int[]> entry : histograms.entrySet()) {
+      System.out.println("\n------------------------------------------------------------");
+      System.out.println(entry.getKey());
+      int[] histogram = entry.getValue();
+      // only the first 100 bins are shown; never read past a shorter histogram
+      int bins = Math.min(100, histogram.length);
+      for (int i = 0; i < bins; i++) {
+        StringBuilder bar = new StringBuilder();
+        for (int j = 0; j < histogram[i]; j++) {
+          bar.append('*');
+        }
+        System.out.println(bar.toString());  // an empty bin prints an empty line
+      }
+    }
+  }
+  
+  /**
+   * Writes all histograms to "demo-data.xml" as one &lt;add&gt; element with
+   * one &lt;doc&gt; per histogram (image file name, query and histogram
+   * fields).
+   *
+   * @param histograms the histograms to export, keyed by file name
+   * @throws IOException if the file cannot be created or written
+   */
+  static void printForSolr(Map<String, int[]> histograms) throws IOException {
+    // XML without a declaration is read as UTF-8, so do not rely on the
+    // platform default charset
+    PrintWriter pw = new PrintWriter("demo-data.xml", "UTF-8");
+    try {
+      pw.println("<add>");
+      for (Map.Entry<String, int[]> entry : histograms.entrySet()) {
+        int[] histogram = entry.getValue();
+        pw.println("<doc>");
+        printImageFileNameField(pw, entry.getKey());
+        printAssembledQueryField(pw, histogram);
+        printHistogramField(pw, histogram);
+        pw.println("</doc>");
+      }
+      pw.println("</add>");
+      // PrintWriter never throws on write errors, it only records them
+      if (pw.checkError()) {
+        throw new IOException("failed to write demo-data.xml");
+      }
+    } finally {
+      pw.close();
+    }
+  }
+  
+  /**
+   * Prints the "imgFile" field. A key of the form
+   * "&lt;directory&gt;-&lt;image&gt;.txt" becomes
+   * "images/&lt;directory&gt;/&lt;image&gt;.jpg"; the split is made at the
+   * first '-'.
+   *
+   * @param pw the writer to print to
+   * @param key the descriptor file name
+   * @throws IOException declared for symmetry with the other field printers
+   * @throws IllegalArgumentException if the key contains no '-'
+   */
+  private static void printImageFileNameField(PrintWriter pw, String key) throws IOException {
+    int idx = key.indexOf('-');
+    if (idx < 0) {
+      throw new IllegalArgumentException(
+          "expected a file name of the form <directory>-<image>.txt but got: " + key);
+    }
+    String dir = key.substring(0, idx);
+    String image = key.substring(idx + 1);
+    // swap only the trailing extension; ".txt" elsewhere in the name is kept
+    if (image.endsWith(".txt")) {
+      image = image.substring(0, image.length() - ".txt".length()) + ".jpg";
+    }
+    printField(pw, "imgFile", "images/" + dir + "/" + image);
+  }
+  
+  /**
+   * Prints the "query" field: one "&lt;bin&gt;^&lt;count&gt;" term for every
+   * bin with a positive count, separated by single spaces.
+   *
+   * @param pw the writer to print to
+   * @param histogram the vote counts per bin
+   * @throws IOException declared for symmetry with the other field printers
+   */
+  private static void printAssembledQueryField(PrintWriter pw, int[] histogram) throws IOException {
+    StringBuilder query = new StringBuilder();
+    int bin = 0;
+    for (int count : histogram) {
+      if (count > 0) {
+        // separator goes in front of every term but the first
+        if (query.length() > 0) {
+          query.append(' ');
+        }
+        query.append(bin).append('^').append(count);
+      }
+      bin++;
+    }
+
+    printField(pw, "query", query.toString());
+  }
+  
+  /**
+   * Prints the "histogram" field: every bin index repeated as often as its
+   * count, separated by single spaces.
+   *
+   * @param pw the writer to print to
+   * @param histogram the vote counts per bin
+   * @throws IOException declared for symmetry with the other field printers
+   */
+  private static void printHistogramField(PrintWriter pw, int[] histogram) throws IOException {
+    StringBuilder terms = new StringBuilder();
+    for (int bin = 0; bin < histogram.length; bin++) {
+      for (int remaining = histogram[bin]; remaining > 0; remaining--) {
+        // separator goes in front of every term but the first
+        if (terms.length() > 0) {
+          terms.append(' ');
+        }
+        terms.append(bin);
+      }
+    }
+
+    printField(pw, "histogram", terms.toString());
+  }
+  
+  /**
+   * Prints one Solr field element, indented by two spaces and terminated
+   * by a line feed. Name and value are XML-escaped so that characters such
+   * as '&amp;' or '&lt;' in a file name cannot break the document.
+   *
+   * @param pw the writer to print to
+   * @param name the field name
+   * @param value the field content
+   * @throws IOException declared for callers; not thrown by this method
+   */
+  static void printField(PrintWriter pw, String name, String value) throws IOException {
+    pw.printf("  <field name=\"%s\">%s</field>\n", escapeXml(name), escapeXml(value));
+  }
+
+  /** Replaces the characters that act as markup in XML text and attribute values. */
+  private static String escapeXml(String s) {
+    String text = String.valueOf(s);  // null prints as "null", like %s does
+    StringBuilder sb = new StringBuilder(text.length());
+    for (int i = 0; i < text.length(); i++) {
+      char c = text.charAt(i);
+      switch (c) {
+        case '&':
+          sb.append("&amp;");
+          break;
+        case '<':
+          sb.append("&lt;");
+          break;
+        case '>':
+          sb.append("&gt;");
+          break;
+        case '"':
+          sb.append("&quot;");
+          break;
+        case '\'':
+          sb.append("&apos;");
+          break;
+        default:
+          sb.append(c);
+      }
+    }
+    return sb.toString();
+  }
+  
+  /**
+   * Small manual check: for every histogram whose key ends with "0010.txt",
+   * scores all histograms against it with cosine() and prints the (at most)
+   * ten best matches to standard output.
+   *
+   * @param histograms the histograms to compare, keyed by file name
+   */
+  static void test(Map<String, int[]> histograms) {
+    Comparator<KeyScorePair> c = new KeyScorePairComparator();
+    for (Map.Entry<String, int[]> src : histograms.entrySet()) {
+      String key = src.getKey();
+      if (!key.endsWith("0010.txt")) {
+        continue;
+      }
+
+      int[] srcHisto = src.getValue();
+      List<KeyScorePair> list = new ArrayList<KeyScorePair>(histograms.size());
+      for (Map.Entry<String, int[]> entry : histograms.entrySet()) {
+        // intersection(srcHisto, entry.getValue()) is an alternative score
+        list.add(new KeyScorePair(entry.getKey(), cosine(srcHisto, entry.getValue())));
+      }
+      Collections.sort(list, c);
+
+      System.out.printf("\n%s\n", key);
+      // there may be fewer than ten histograms in total
+      int top = Math.min(10, list.size());
+      for (int i = 0; i < top; i++) {
+        KeyScorePair ksp = list.get(i);
+        System.out.printf("\t%s, %f\n", ksp.key, ksp.score);
+      }
+    }
+  }
+  
+  /**
+   * Returns the histogram intersection of two vectors: the sum of the
+   * element-wise minima over the length of the first vector.
+   *
+   * @param v1 the first histogram; its length drives the loop
+   * @param v2 the second histogram, at least as long as v1
+   * @return the sum of min(v1[i], v2[i])
+   */
+  public static double intersection(int[] v1, int[] v2) {
+    int total = 0;
+    int idx = 0;
+    while (idx < v1.length) {
+      int a = v1[idx];
+      int b = v2[idx];
+      total += (a <= b) ? a : b;
+      idx++;
+    }
+    return total;
+  }
+  
+  /**
+   * Returns the cosine similarity of two histograms. The dot product runs
+   * over the length of v1; each norm covers the whole of its own vector.
+   * All products are widened before multiplying so that large counts cannot
+   * overflow int arithmetic.
+   *
+   * @param v1 the first histogram
+   * @param v2 the second histogram, at least as long as v1
+   * @return the cosine similarity, or 0 if the dot product or a norm is zero
+   */
+  public static double cosine(int[] v1, int[] v2) {
+    long numerator = 0;
+    for (int i = 0; i < v1.length; i++) {
+      numerator += (long) v1[i] * v2[i];
+    }
+    if (numerator == 0) {
+      return 0;
+    }
+    double denominator = Math.sqrt(sumOfSquares(v1)) * Math.sqrt(sumOfSquares(v2));
+
+    // should not happen with a non-zero numerator, but avoid dividing by zero
+    if (denominator == 0.0) {
+      return 0;
+    }
+
+    return numerator / denominator;
+  }
+
+  /** Sums the squared elements, multiplying in double to avoid int overflow. */
+  private static double sumOfSquares(int[] v) {
+    double sum = 0;
+    for (int x : v) {
+      sum += (double) x * x;
+    }
+    return sum;
+  }
+
+  /**
+   * Returns the Euclidean norm of the vector: the square root of the sum of
+   * its squared elements. Each square is computed in double so that
+   * elements above 46340 do not overflow int arithmetic.
+   *
+   * @param v the vector
+   * @return the norm; 0 for an empty vector
+   */
+  public static double getSumSquareRoot(int[] v) {
+    double sum = 0;
+    for (int i = 0; i < v.length; i++) {
+      sum += (double) v[i] * v[i];
+    }
+
+    return Math.sqrt(sum);
+  }
+  
+  /** Pairs a histogram key with the score computed for it. */
+  static class KeyScorePair {
+    String key;
+    double score;
+
+    /**
+     * @param k the histogram key
+     * @param s the score assigned to that key
+     */
+    public KeyScorePair(String k, double s) {
+      key = k;
+      score = s;
+    }
+  }
+  
+  /**
+   * Orders pairs by descending score. Equal scores compare as 0, which the
+   * Comparator contract requires; a comparator that never returns 0 can
+   * make Collections.sort() fail with "Comparison method violates its
+   * general contract!".
+   */
+  static class KeyScorePairComparator implements Comparator<KeyScorePair> {
+    public int compare(KeyScorePair arg0, KeyScorePair arg1) {
+      // arguments swapped on purpose: higher scores sort first
+      return Double.compare(arg1.score, arg0.score);
+    }
+  }
+}



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

svn commit: r1419182 - in /labs/alike/trunk/src/main/java/org: ./ apache/ apache/alike/ apache/alike/Clustering.java apache/alike/HistogramMatching.java

Reply via email to