Author: kayyagari
Date: Mon Mar 25 14:02:33 2013
New Revision: 1460667

URL: http://svn.apache.org/r1460667
Log:
utility for sorting large number of keys using secondary storage

Added:
    labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/
    
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/BulkDataSorter.java
    
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/IntTupleReaderWriter.java
    
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/TupleReaderWriter.java
    labs/mavibot/trunk/mavibot/src/test/java/org/apache/mavibot/btree/util/
    
labs/mavibot/trunk/mavibot/src/test/java/org/apache/mavibot/btree/util/BulkDataSorterTest.java

Added: 
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/BulkDataSorter.java
URL: 
http://svn.apache.org/viewvc/labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/BulkDataSorter.java?rev=1460667&view=auto
==============================================================================
--- 
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/BulkDataSorter.java
 (added)
+++ 
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/BulkDataSorter.java
 Mon Mar 25 14:02:33 2013
@@ -0,0 +1,273 @@
+/*
+ *   Licensed to the Apache Software Foundation (ASF) under one
+ *   or more contributor license agreements.  See the NOTICE file
+ *   distributed with this work for additional information
+ *   regarding copyright ownership.  The ASF licenses this file
+ *   to you under the Apache License, Version 2.0 (the
+ *   "License"); you may not use this file except in compliance
+ *   with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing,
+ *   software distributed under the License is distributed on an
+ *   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *   KIND, either express or implied.  See the License for the
+ *   specific language governing permissions and limitations
+ *   under the License.
+ *
+ */
+package org.apache.mavibot.btree.util;
+
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.lang.reflect.Array;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.UUID;
+
+import org.apache.mavibot.btree.BTreeBuilder;
+import org.apache.mavibot.btree.Tuple;
+
+
+/**
+ * A utility class for sorting a large number of keys before building a BTree 
using {@link BTreeBuilder}.
+ *
+ * @author <a href="mailto:[email protected]";>Apache Directory 
Project</a>
+ */
+public class BulkDataSorter<K, V>
+{
+    private File workDir;
+
+    private int splitAfter = 1000;
+
+    private Comparator<Tuple<K, V>> tupleComparator;
+
+    private TupleReaderWriter<K, V> readerWriter;
+
+    private boolean sorted;
+
+
+    public BulkDataSorter( TupleReaderWriter<K, V> readerWriter, 
Comparator<Tuple<K, V>> tupleComparator,
+        int splitAfter )
+    {
+        if ( splitAfter <= 0 )
+        {
+            throw new IllegalArgumentException( "Value of splitAfter parameter 
cannot be null" );
+        }
+
+        this.splitAfter = splitAfter;
+
+        this.workDir = new File( System.getProperty( "java.io.tmpdir" ), 
System.currentTimeMillis() + "-sort" );
+        workDir.mkdir();
+
+        this.readerWriter = readerWriter;
+        this.tupleComparator = tupleComparator;
+    }
+
+
+    public void sort( File dataFile ) throws IOException
+    {
+        int i = 0;
+
+        Tuple<K, V>[] arr = ( Tuple<K, V>[] ) Array.newInstance( Tuple.class, 
splitAfter );
+
+        Tuple<K, V> t = null;
+
+        DataInputStream in = new DataInputStream( new FileInputStream( 
dataFile ) );
+
+        while ( ( t = readerWriter.readTuple( in ) ) != null )
+        {
+            arr[i++] = t;
+
+            if ( ( i % splitAfter ) == 0 )
+            {
+                i = 0;
+                Arrays.sort( arr, tupleComparator );
+
+                storeSortedData( arr );
+            }
+        }
+
+        if ( i != 0 )
+        {
+            Tuple<K, V>[] tmp = ( Tuple<K, V>[] ) Array.newInstance( 
Tuple.class, i );
+            System.arraycopy( arr, 0, tmp, 0, i );
+            Arrays.sort( tmp, tupleComparator );
+
+            storeSortedData( tmp );
+        }
+
+        sorted = true;
+    }
+
+
+    private void storeSortedData( Tuple<K, V>[] arr ) throws IOException
+    {
+        File tempFile = File.createTempFile( UUID.randomUUID().toString(), 
".batch", workDir );
+        DataOutputStream out = new DataOutputStream( new FileOutputStream( 
tempFile ) );
+
+        for ( Tuple<K, V> t : arr )
+        {
+            readerWriter.writeTuple( t, out );
+        }
+
+        out.flush();
+        out.close();
+    }
+
+
+    public File getWorkDir()
+    {
+        return workDir;
+    }
+
+
+    public Iterator<Tuple<K, V>> getMergeSortedTuples() throws IOException
+    {
+        if ( !sorted )
+        {
+            throw new IllegalStateException( "Data is not sorted" );
+        }
+
+        File[] batches = workDir.listFiles();
+
+        if ( batches.length == 0 )
+        {
+            return Collections.EMPTY_LIST.iterator();
+        }
+
+        final DataInputStream[] streams = new DataInputStream[batches.length];
+
+        for ( int i = 0; i < batches.length; i++ )
+        {
+            streams[i] = new DataInputStream( new FileInputStream( batches[i] 
) );
+        }
+
+        Iterator<Tuple<K, V>> itr = new Iterator<Tuple<K, V>>()
+        {
+            private Tuple<K, V>[] heads = ( Tuple<K, V>[] ) Array.newInstance( 
Tuple.class, streams.length );
+
+            private Tuple<K, V> candidate = null;
+
+            private boolean closed;
+
+            private int candidatePos = -1;
+
+
+            @Override
+            public boolean hasNext()
+            {
+
+                if ( closed )
+                {
+                    throw new IllegalStateException( "No elements to read" );
+                }
+
+                Tuple<K, V> available = null;
+
+                for ( int i = 0; i < streams.length; i++ )
+                {
+                    if ( heads[i] == null )
+                    {
+                        heads[i] = readerWriter.readTuple( streams[i] );
+                    }
+
+                    if ( available == null )
+                    {
+                        available = heads[i];
+                        candidatePos = i;
+                    }
+                    else
+                    {
+                        if ( ( available != null ) && ( heads[i] != null ) )
+                        {
+                            int comp = tupleComparator.compare( heads[i], 
available );
+                            if ( comp <= 0 )
+                            {
+                                available = heads[i];
+                                candidatePos = i;
+                            }
+                        }
+                    }
+                }
+
+                heads[candidatePos] = null;
+
+                if ( available == null )
+                {
+                    for ( int i = 0; i < streams.length; i++ )
+                    {
+                        if ( heads[i] != null )
+                        {
+                            available = heads[i];
+                            heads[i] = readerWriter.readTuple( streams[i] );
+                            break;
+                        }
+                    }
+                }
+
+                if ( available != null )
+                {
+                    candidate = available;
+                    return true;
+                }
+
+                // finally close the streams
+                for ( DataInputStream in : streams )
+                {
+                    try
+                    {
+                        in.close();
+                    }
+                    catch ( Exception e )
+                    {
+                        e.printStackTrace();
+                    }
+                }
+
+                closed = true;
+
+                return false;
+            }
+
+
+            @Override
+            public Tuple<K, V> next()
+            {
+                if ( candidate == null )
+                {
+                    if ( !closed )
+                    {
+                        hasNext();
+                    }
+                }
+
+                if ( candidate == null )
+                {
+                    throw new NoSuchElementException( "No tuples found" );
+                }
+
+                return candidate;
+            }
+
+
+            @Override
+            public void remove()
+            {
+                throw new UnsupportedOperationException( "Not supported" );
+            }
+
+        };
+
+        return itr;
+    }
+}

Added: 
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/IntTupleReaderWriter.java
URL: 
http://svn.apache.org/viewvc/labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/IntTupleReaderWriter.java?rev=1460667&view=auto
==============================================================================
--- 
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/IntTupleReaderWriter.java
 (added)
+++ 
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/IntTupleReaderWriter.java
 Mon Mar 25 14:02:33 2013
@@ -0,0 +1,67 @@
+/*
+ *   Licensed to the Apache Software Foundation (ASF) under one
+ *   or more contributor license agreements.  See the NOTICE file
+ *   distributed with this work for additional information
+ *   regarding copyright ownership.  The ASF licenses this file
+ *   to you under the Apache License, Version 2.0 (the
+ *   "License"); you may not use this file except in compliance
+ *   with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing,
+ *   software distributed under the License is distributed on an
+ *   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *   KIND, either express or implied.  See the License for the
+ *   specific language governing permissions and limitations
+ *   under the License.
+ *
+ */
+package org.apache.mavibot.btree.util;
+
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+import org.apache.mavibot.btree.Tuple;
+
+
+/**
+ * TODO IntTupleReaderWriter.
+ *
+ * @author <a href="mailto:[email protected]";>Apache Directory 
Project</a>
+ */
+public class IntTupleReaderWriter implements TupleReaderWriter<Integer, 
Integer>
+{
+
+    @Override
+    public void writeTuple( Tuple<Integer, Integer> t, DataOutputStream out ) 
throws IOException
+    {
+        out.writeInt( t.getKey() );
+        out.writeInt( t.getValue() );
+    }
+
+
+    @Override
+    public Tuple<Integer, Integer> readTuple( DataInputStream in )
+    {
+
+        try
+        {
+            if ( in.available() <= 0 )
+            {
+                return null;
+            }
+
+            Tuple<Integer, Integer> t = new Tuple<Integer, Integer>( 
in.readInt(), in.readInt() );
+
+            return t;
+        }
+        catch ( IOException e )
+        {
+        }
+
+        return null;
+    }
+}

Added: 
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/TupleReaderWriter.java
URL: 
http://svn.apache.org/viewvc/labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/TupleReaderWriter.java?rev=1460667&view=auto
==============================================================================
--- 
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/TupleReaderWriter.java
 (added)
+++ 
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/TupleReaderWriter.java
 Mon Mar 25 14:02:33 2013
@@ -0,0 +1,41 @@
+/*
+ *   Licensed to the Apache Software Foundation (ASF) under one
+ *   or more contributor license agreements.  See the NOTICE file
+ *   distributed with this work for additional information
+ *   regarding copyright ownership.  The ASF licenses this file
+ *   to you under the Apache License, Version 2.0 (the
+ *   "License"); you may not use this file except in compliance
+ *   with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing,
+ *   software distributed under the License is distributed on an
+ *   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *   KIND, either express or implied.  See the License for the
+ *   specific language governing permissions and limitations
+ *   under the License.
+ *
+ */
+package org.apache.mavibot.btree.util;
+
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+import org.apache.mavibot.btree.Tuple;
+
+
+/**
+ * TODO TupleReaderWriter.
+ *
+ * @author <a href="mailto:[email protected]";>Apache Directory 
Project</a>
+ */
+public interface TupleReaderWriter<K, V>
+{
+    Tuple<K, V> readTuple( DataInputStream in );
+
+
+    void writeTuple( Tuple<K, V> t, DataOutputStream out ) throws IOException;
+}

Added: 
labs/mavibot/trunk/mavibot/src/test/java/org/apache/mavibot/btree/util/BulkDataSorterTest.java
URL: 
http://svn.apache.org/viewvc/labs/mavibot/trunk/mavibot/src/test/java/org/apache/mavibot/btree/util/BulkDataSorterTest.java?rev=1460667&view=auto
==============================================================================
--- 
labs/mavibot/trunk/mavibot/src/test/java/org/apache/mavibot/btree/util/BulkDataSorterTest.java
 (added)
+++ 
labs/mavibot/trunk/mavibot/src/test/java/org/apache/mavibot/btree/util/BulkDataSorterTest.java
 Mon Mar 25 14:02:33 2013
@@ -0,0 +1,176 @@
+/*
+ *   Licensed to the Apache Software Foundation (ASF) under one
+ *   or more contributor license agreements.  See the NOTICE file
+ *   distributed with this work for additional information
+ *   regarding copyright ownership.  The ASF licenses this file
+ *   to you under the Apache License, Version 2.0 (the
+ *   "License"); you may not use this file except in compliance
+ *   with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing,
+ *   software distributed under the License is distributed on an
+ *   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *   KIND, either express or implied.  See the License for the
+ *   specific language governing permissions and limitations
+ *   under the License.
+ *
+ */
+package org.apache.mavibot.btree.util;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.lang.reflect.Array;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.Random;
+
+import org.apache.mavibot.btree.Tuple;
+import org.junit.Test;
+
+/**
+ * Test cases for BulkDataSorter.
+ *
+ * @author <a href="mailto:[email protected]";>Apache Directory 
Project</a>
+ */
+public class BulkDataSorterTest
+{
+
+    private Comparator<Tuple<Integer, Integer>> tupleComp = new 
Comparator<Tuple<Integer, Integer>>()
+    {
+        
+        @Override
+        public int compare( Tuple<Integer, Integer> o1, Tuple<Integer, 
Integer> o2 )
+        {
+            return o1.getKey().compareTo( o2.getKey() );
+        }
+    };
+
+    
+    @Test
+    public void testSortedFileCount() throws IOException
+    {
+        int count = 7;
+        IntTupleReaderWriter itrw = new IntTupleReaderWriter();
+        Random random = new Random();
+        
+        File dataFile = File.createTempFile( "tuple", ".data" );
+        dataFile.deleteOnExit();
+        DataOutputStream out = new DataOutputStream( new FileOutputStream( 
dataFile ) );
+        
+        Tuple<Integer, Integer>[] arr = (Tuple<Integer, Integer>[]) 
Array.newInstance( Tuple.class, count );
+        
+        for ( int i = 0; i < count; i++ )
+        {
+            int x = random.nextInt(100);
+            //System.out.println(x);
+
+            Tuple<Integer, Integer> t = new Tuple<Integer, Integer>( x, x );
+            
+            arr[i] = t;
+            
+            itrw.writeTuple( t, out );
+        }
+
+        out.close();
+        
+        BulkDataSorter<Integer, Integer> bds = new BulkDataSorter<Integer, 
Integer>( itrw, tupleComp, 4 );
+        bds.sort( dataFile );
+        
+        assertEquals(2, bds.getWorkDir().list().length);
+        
+        deleteDir( bds.getWorkDir() );
+    }
+    
+    @Test
+    public void testSortedFileMerge() throws IOException
+    {
+        testSortedFileMerge( 10, 2 );
+        testSortedFileMerge( 100, 7 );
+        testSortedFileMerge( 1000, 25 );
+        testSortedFileMerge( 10000, 100 );
+        testSortedFileMerge( 10000, 101 );
+        testSortedFileMerge( 100000, 501 );
+    }
+    
+    public void testSortedFileMerge(int count, int splitAfter) throws 
IOException
+    {
+        IntTupleReaderWriter itrw = new IntTupleReaderWriter();
+        Random random = new Random();
+        
+        File dataFile = File.createTempFile( "tuple", ".data" );
+        dataFile.deleteOnExit();
+        
+        DataOutputStream out = new DataOutputStream( new FileOutputStream( 
dataFile ) );
+        
+        Tuple<Integer, Integer>[] arr = (Tuple<Integer, Integer>[]) 
Array.newInstance( Tuple.class, count );
+        
+        int randUpper = count;
+        if(count < 100)
+        {
+            randUpper = 100;
+        }
+        
+        for ( int i = 0; i < count; i++ )
+        {
+            int x = random.nextInt(randUpper);
+            //System.out.println(x);
+
+            Tuple<Integer, Integer> t = new Tuple<Integer, Integer>( x, x );
+            
+            arr[i] = t;
+            
+            itrw.writeTuple( t, out );
+        }
+
+        out.close();
+        
+        BulkDataSorter<Integer, Integer> bds = new BulkDataSorter<Integer, 
Integer>( itrw, tupleComp, splitAfter );
+        bds.sort( dataFile );
+        
+        Iterator<Tuple<Integer,Integer>> itr = bds.getMergeSortedTuples();
+        
+        Integer prev = null;
+        while(itr.hasNext())
+        {
+            Tuple<Integer,Integer> t = itr.next();
+            
+            if(prev == null)
+            {
+                prev = t.getKey();
+            }
+            else
+            {
+                assertTrue( prev <= t.getKey() );
+            }
+            
+            //System.out.println(t);
+        }
+        
+        deleteDir( bds.getWorkDir() );
+    }
+    
+    
+    private void deleteDir(File dir)
+    {
+        if(dir.isFile())
+        {
+            dir.delete();
+        }
+        
+        File[] files = dir.listFiles();
+        
+        for(File f: files)
+        {
+            f.delete();
+        }
+        
+        dir.delete();
+    }
+}



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to