Author: kayyagari
Date: Mon Mar 25 14:02:33 2013
New Revision: 1460667
URL: http://svn.apache.org/r1460667
Log:
utility for sorting large number of keys using secondary storage
Added:
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/BulkDataSorter.java
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/IntTupleReaderWriter.java
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/TupleReaderWriter.java
labs/mavibot/trunk/mavibot/src/test/java/org/apache/mavibot/btree/util/
labs/mavibot/trunk/mavibot/src/test/java/org/apache/mavibot/btree/util/BulkDataSorterTest.java
Added:
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/BulkDataSorter.java
URL:
http://svn.apache.org/viewvc/labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/BulkDataSorter.java?rev=1460667&view=auto
==============================================================================
---
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/BulkDataSorter.java
(added)
+++
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/BulkDataSorter.java
Mon Mar 25 14:02:33 2013
@@ -0,0 +1,273 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.mavibot.btree.util;
+
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.lang.reflect.Array;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.UUID;
+
+import org.apache.mavibot.btree.BTreeBuilder;
+import org.apache.mavibot.btree.Tuple;
+
+
+/**
+ * A utility class for sorting a large number of keys before building a BTree
using {@link BTreeBuilder}.
+ *
+ * @author <a href="mailto:[email protected]">Apache Directory
Project</a>
+ */
+public class BulkDataSorter<K, V>
+{
+ private File workDir;
+
+ private int splitAfter = 1000;
+
+ private Comparator<Tuple<K, V>> tupleComparator;
+
+ private TupleReaderWriter<K, V> readerWriter;
+
+ private boolean sorted;
+
+
+ public BulkDataSorter( TupleReaderWriter<K, V> readerWriter,
Comparator<Tuple<K, V>> tupleComparator,
+ int splitAfter )
+ {
+ if ( splitAfter <= 0 )
+ {
+ throw new IllegalArgumentException( "Value of splitAfter parameter
cannot be null" );
+ }
+
+ this.splitAfter = splitAfter;
+
+ this.workDir = new File( System.getProperty( "java.io.tmpdir" ),
System.currentTimeMillis() + "-sort" );
+ workDir.mkdir();
+
+ this.readerWriter = readerWriter;
+ this.tupleComparator = tupleComparator;
+ }
+
+
+ public void sort( File dataFile ) throws IOException
+ {
+ int i = 0;
+
+ Tuple<K, V>[] arr = ( Tuple<K, V>[] ) Array.newInstance( Tuple.class,
splitAfter );
+
+ Tuple<K, V> t = null;
+
+ DataInputStream in = new DataInputStream( new FileInputStream(
dataFile ) );
+
+ while ( ( t = readerWriter.readTuple( in ) ) != null )
+ {
+ arr[i++] = t;
+
+ if ( ( i % splitAfter ) == 0 )
+ {
+ i = 0;
+ Arrays.sort( arr, tupleComparator );
+
+ storeSortedData( arr );
+ }
+ }
+
+ if ( i != 0 )
+ {
+ Tuple<K, V>[] tmp = ( Tuple<K, V>[] ) Array.newInstance(
Tuple.class, i );
+ System.arraycopy( arr, 0, tmp, 0, i );
+ Arrays.sort( tmp, tupleComparator );
+
+ storeSortedData( tmp );
+ }
+
+ sorted = true;
+ }
+
+
+ private void storeSortedData( Tuple<K, V>[] arr ) throws IOException
+ {
+ File tempFile = File.createTempFile( UUID.randomUUID().toString(),
".batch", workDir );
+ DataOutputStream out = new DataOutputStream( new FileOutputStream(
tempFile ) );
+
+ for ( Tuple<K, V> t : arr )
+ {
+ readerWriter.writeTuple( t, out );
+ }
+
+ out.flush();
+ out.close();
+ }
+
+
+ public File getWorkDir()
+ {
+ return workDir;
+ }
+
+
+ public Iterator<Tuple<K, V>> getMergeSortedTuples() throws IOException
+ {
+ if ( !sorted )
+ {
+ throw new IllegalStateException( "Data is not sorted" );
+ }
+
+ File[] batches = workDir.listFiles();
+
+ if ( batches.length == 0 )
+ {
+ return Collections.EMPTY_LIST.iterator();
+ }
+
+ final DataInputStream[] streams = new DataInputStream[batches.length];
+
+ for ( int i = 0; i < batches.length; i++ )
+ {
+ streams[i] = new DataInputStream( new FileInputStream( batches[i]
) );
+ }
+
+ Iterator<Tuple<K, V>> itr = new Iterator<Tuple<K, V>>()
+ {
+ private Tuple<K, V>[] heads = ( Tuple<K, V>[] ) Array.newInstance(
Tuple.class, streams.length );
+
+ private Tuple<K, V> candidate = null;
+
+ private boolean closed;
+
+ private int candidatePos = -1;
+
+
+ @Override
+ public boolean hasNext()
+ {
+
+ if ( closed )
+ {
+ throw new IllegalStateException( "No elements to read" );
+ }
+
+ Tuple<K, V> available = null;
+
+ for ( int i = 0; i < streams.length; i++ )
+ {
+ if ( heads[i] == null )
+ {
+ heads[i] = readerWriter.readTuple( streams[i] );
+ }
+
+ if ( available == null )
+ {
+ available = heads[i];
+ candidatePos = i;
+ }
+ else
+ {
+ if ( ( available != null ) && ( heads[i] != null ) )
+ {
+ int comp = tupleComparator.compare( heads[i],
available );
+ if ( comp <= 0 )
+ {
+ available = heads[i];
+ candidatePos = i;
+ }
+ }
+ }
+ }
+
+ heads[candidatePos] = null;
+
+ if ( available == null )
+ {
+ for ( int i = 0; i < streams.length; i++ )
+ {
+ if ( heads[i] != null )
+ {
+ available = heads[i];
+ heads[i] = readerWriter.readTuple( streams[i] );
+ break;
+ }
+ }
+ }
+
+ if ( available != null )
+ {
+ candidate = available;
+ return true;
+ }
+
+ // finally close the streams
+ for ( DataInputStream in : streams )
+ {
+ try
+ {
+ in.close();
+ }
+ catch ( Exception e )
+ {
+ e.printStackTrace();
+ }
+ }
+
+ closed = true;
+
+ return false;
+ }
+
+
+ @Override
+ public Tuple<K, V> next()
+ {
+ if ( candidate == null )
+ {
+ if ( !closed )
+ {
+ hasNext();
+ }
+ }
+
+ if ( candidate == null )
+ {
+ throw new NoSuchElementException( "No tuples found" );
+ }
+
+ return candidate;
+ }
+
+
+ @Override
+ public void remove()
+ {
+ throw new UnsupportedOperationException( "Not supported" );
+ }
+
+ };
+
+ return itr;
+ }
+}
Added:
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/IntTupleReaderWriter.java
URL:
http://svn.apache.org/viewvc/labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/IntTupleReaderWriter.java?rev=1460667&view=auto
==============================================================================
---
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/IntTupleReaderWriter.java
(added)
+++
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/IntTupleReaderWriter.java
Mon Mar 25 14:02:33 2013
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.mavibot.btree.util;
+
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+import org.apache.mavibot.btree.Tuple;
+
+
+/**
+ * TODO IntTupleReaderWriter.
+ *
+ * @author <a href="mailto:[email protected]">Apache Directory
Project</a>
+ */
+public class IntTupleReaderWriter implements TupleReaderWriter<Integer,
Integer>
+{
+
+ @Override
+ public void writeTuple( Tuple<Integer, Integer> t, DataOutputStream out )
throws IOException
+ {
+ out.writeInt( t.getKey() );
+ out.writeInt( t.getValue() );
+ }
+
+
+ @Override
+ public Tuple<Integer, Integer> readTuple( DataInputStream in )
+ {
+
+ try
+ {
+ if ( in.available() <= 0 )
+ {
+ return null;
+ }
+
+ Tuple<Integer, Integer> t = new Tuple<Integer, Integer>(
in.readInt(), in.readInt() );
+
+ return t;
+ }
+ catch ( IOException e )
+ {
+ }
+
+ return null;
+ }
+}
Added:
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/TupleReaderWriter.java
URL:
http://svn.apache.org/viewvc/labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/TupleReaderWriter.java?rev=1460667&view=auto
==============================================================================
---
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/TupleReaderWriter.java
(added)
+++
labs/mavibot/trunk/mavibot/src/main/java/org/apache/mavibot/btree/util/TupleReaderWriter.java
Mon Mar 25 14:02:33 2013
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.mavibot.btree.util;
+
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+import org.apache.mavibot.btree.Tuple;
+
+
+/**
+ * TODO TupleReaderWriter.
+ *
+ * @author <a href="mailto:[email protected]">Apache Directory
Project</a>
+ */
+public interface TupleReaderWriter<K, V>
+{
+ Tuple<K, V> readTuple( DataInputStream in );
+
+
+ void writeTuple( Tuple<K, V> t, DataOutputStream out ) throws IOException;
+}
Added:
labs/mavibot/trunk/mavibot/src/test/java/org/apache/mavibot/btree/util/BulkDataSorterTest.java
URL:
http://svn.apache.org/viewvc/labs/mavibot/trunk/mavibot/src/test/java/org/apache/mavibot/btree/util/BulkDataSorterTest.java?rev=1460667&view=auto
==============================================================================
---
labs/mavibot/trunk/mavibot/src/test/java/org/apache/mavibot/btree/util/BulkDataSorterTest.java
(added)
+++
labs/mavibot/trunk/mavibot/src/test/java/org/apache/mavibot/btree/util/BulkDataSorterTest.java
Mon Mar 25 14:02:33 2013
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.mavibot.btree.util;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.lang.reflect.Array;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.Random;
+
+import org.apache.mavibot.btree.Tuple;
+import org.junit.Test;
+
+/**
+ * Test cases for BulkDataSorter.
+ *
+ * @author <a href="mailto:[email protected]">Apache Directory
Project</a>
+ */
+public class BulkDataSorterTest
+{
+
+ private Comparator<Tuple<Integer, Integer>> tupleComp = new
Comparator<Tuple<Integer, Integer>>()
+ {
+
+ @Override
+ public int compare( Tuple<Integer, Integer> o1, Tuple<Integer,
Integer> o2 )
+ {
+ return o1.getKey().compareTo( o2.getKey() );
+ }
+ };
+
+
+ @Test
+ public void testSortedFileCount() throws IOException
+ {
+ int count = 7;
+ IntTupleReaderWriter itrw = new IntTupleReaderWriter();
+ Random random = new Random();
+
+ File dataFile = File.createTempFile( "tuple", ".data" );
+ dataFile.deleteOnExit();
+ DataOutputStream out = new DataOutputStream( new FileOutputStream(
dataFile ) );
+
+ Tuple<Integer, Integer>[] arr = (Tuple<Integer, Integer>[])
Array.newInstance( Tuple.class, count );
+
+ for ( int i = 0; i < count; i++ )
+ {
+ int x = random.nextInt(100);
+ //System.out.println(x);
+
+ Tuple<Integer, Integer> t = new Tuple<Integer, Integer>( x, x );
+
+ arr[i] = t;
+
+ itrw.writeTuple( t, out );
+ }
+
+ out.close();
+
+ BulkDataSorter<Integer, Integer> bds = new BulkDataSorter<Integer,
Integer>( itrw, tupleComp, 4 );
+ bds.sort( dataFile );
+
+ assertEquals(2, bds.getWorkDir().list().length);
+
+ deleteDir( bds.getWorkDir() );
+ }
+
+ @Test
+ public void testSortedFileMerge() throws IOException
+ {
+ testSortedFileMerge( 10, 2 );
+ testSortedFileMerge( 100, 7 );
+ testSortedFileMerge( 1000, 25 );
+ testSortedFileMerge( 10000, 100 );
+ testSortedFileMerge( 10000, 101 );
+ testSortedFileMerge( 100000, 501 );
+ }
+
+ public void testSortedFileMerge(int count, int splitAfter) throws
IOException
+ {
+ IntTupleReaderWriter itrw = new IntTupleReaderWriter();
+ Random random = new Random();
+
+ File dataFile = File.createTempFile( "tuple", ".data" );
+ dataFile.deleteOnExit();
+
+ DataOutputStream out = new DataOutputStream( new FileOutputStream(
dataFile ) );
+
+ Tuple<Integer, Integer>[] arr = (Tuple<Integer, Integer>[])
Array.newInstance( Tuple.class, count );
+
+ int randUpper = count;
+ if(count < 100)
+ {
+ randUpper = 100;
+ }
+
+ for ( int i = 0; i < count; i++ )
+ {
+ int x = random.nextInt(randUpper);
+ //System.out.println(x);
+
+ Tuple<Integer, Integer> t = new Tuple<Integer, Integer>( x, x );
+
+ arr[i] = t;
+
+ itrw.writeTuple( t, out );
+ }
+
+ out.close();
+
+ BulkDataSorter<Integer, Integer> bds = new BulkDataSorter<Integer,
Integer>( itrw, tupleComp, splitAfter );
+ bds.sort( dataFile );
+
+ Iterator<Tuple<Integer,Integer>> itr = bds.getMergeSortedTuples();
+
+ Integer prev = null;
+ while(itr.hasNext())
+ {
+ Tuple<Integer,Integer> t = itr.next();
+
+ if(prev == null)
+ {
+ prev = t.getKey();
+ }
+ else
+ {
+ assertTrue( prev <= t.getKey() );
+ }
+
+ //System.out.println(t);
+ }
+
+ deleteDir( bds.getWorkDir() );
+ }
+
+
+ private void deleteDir(File dir)
+ {
+ if(dir.isFile())
+ {
+ dir.delete();
+ }
+
+ File[] files = dir.listFiles();
+
+ for(File f: files)
+ {
+ f.delete();
+ }
+
+ dir.delete();
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]