Author: jbellis
Date: Fri Mar 5 20:31:18 2010
New Revision: 919591
URL: http://svn.apache.org/viewvc?rev=919591&view=rev
Log:
Add javadoc for Hadoop classes. patch by jbellis; reviewed by johano for
CASSANDRA-837
Modified:
incubator/cassandra/branches/cassandra-0.6/CHANGES.txt
incubator/cassandra/branches/cassandra-0.6/src/java/org/apache/cassandra/hadoop/ColumnFamilyInputFormat.java
incubator/cassandra/branches/cassandra-0.6/src/java/org/apache/cassandra/hadoop/ConfigHelper.java
Modified: incubator/cassandra/branches/cassandra-0.6/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/cassandra/branches/cassandra-0.6/CHANGES.txt?rev=919591&r1=919590&r2=919591&view=diff
==============================================================================
--- incubator/cassandra/branches/cassandra-0.6/CHANGES.txt (original)
+++ incubator/cassandra/branches/cassandra-0.6/CHANGES.txt Fri Mar 5 20:31:18 2010
@@ -12,6 +12,7 @@
until it's too late (CASSANDRA-843)
* Add logging of GC activity (CASSANDRA-813)
* fix ConcurrentModificationException in commitlog discard (CASSANDRA-853)
+ * Fix hardcoded row count in Hadoop RecordReader (CASSANDRA-837)
0.6.0-beta1/beta2
Modified:
incubator/cassandra/branches/cassandra-0.6/src/java/org/apache/cassandra/hadoop/ColumnFamilyInputFormat.java
URL: http://svn.apache.org/viewvc/incubator/cassandra/branches/cassandra-0.6/src/java/org/apache/cassandra/hadoop/ColumnFamilyInputFormat.java?rev=919591&r1=919590&r2=919591&view=diff
==============================================================================
--- incubator/cassandra/branches/cassandra-0.6/src/java/org/apache/cassandra/hadoop/ColumnFamilyInputFormat.java (original)
+++ incubator/cassandra/branches/cassandra-0.6/src/java/org/apache/cassandra/hadoop/ColumnFamilyInputFormat.java Fri Mar 5 20:31:18 2010
@@ -37,6 +37,24 @@
import org.apache.thrift.transport.TSocket;
import org.apache.thrift.transport.TTransportException;
+/**
+ * Hadoop InputFormat allowing map/reduce against Cassandra rows within one ColumnFamily.
+ *
+ * At minimum, you need to set the CF and predicate (description of columns to extract from each row)
+ * in your Hadoop job Configuration. The ConfigHelper class is provided to make this
+ * simple:
+ * ConfigHelper.setColumnFamily
+ * ConfigHelper.setSlicePredicate
+ *
+ * You can also configure the number of rows per InputSplit with
+ * ConfigHelper.setInputSplitSize
+ * This should be "as big as possible, but no bigger." Each InputSplit is read from Cassandra
+ * with a single get_slice_range query, and the per-call overhead of get_slice_range is high,
+ * so larger split sizes are better -- but if it is too large, you will run out of memory,
+ * since no paging is done (yet).
+ *
+ * The default split size is 4096 rows.
+ */
public class ColumnFamilyInputFormat extends InputFormat<String, SortedMap<byte[], IColumn>>
{
Modified:
incubator/cassandra/branches/cassandra-0.6/src/java/org/apache/cassandra/hadoop/ConfigHelper.java
URL: http://svn.apache.org/viewvc/incubator/cassandra/branches/cassandra-0.6/src/java/org/apache/cassandra/hadoop/ConfigHelper.java?rev=919591&r1=919590&r2=919591&view=diff
==============================================================================
--- incubator/cassandra/branches/cassandra-0.6/src/java/org/apache/cassandra/hadoop/ConfigHelper.java (original)
+++ incubator/cassandra/branches/cassandra-0.6/src/java/org/apache/cassandra/hadoop/ConfigHelper.java Fri Mar 5 20:31:18 2010
@@ -16,7 +16,15 @@
private static final String COLUMNFAMILY_CONFIG = "cassandra.input.columnfamily";
private static final String PREDICATE_CONFIG = "cassandra.input.predicate";
private static final String INPUT_SPLIT_SIZE_CONFIG = "cassandra.input.split.size";
+ private static final int DEFAULT_SPLIT_SIZE = 4096;
+ /**
+ * Set the keyspace and column family for this job.
+ *
+ * @param conf Job configuration you are about to run
+ * @param keyspace
+ * @param columnFamily
+ */
public static void setColumnFamily(Configuration conf, String keyspace, String columnFamily)
{
if (keyspace == null)
@@ -44,7 +52,7 @@
* This affects the number of maps created, if the number is too small
* the overhead of each map will take up the bulk of the job time.
*
- * @param conf Job configuration you are about to run.
+ * @param conf Job configuration you are about to run
* @param splitsize Size of the input split
*/
public static void setInputSplitSize(Configuration conf, int splitsize)
@@ -54,9 +62,15 @@
public static int getInputSplitSize(Configuration conf)
{
- return conf.getInt(INPUT_SPLIT_SIZE_CONFIG, 4096);
+ return conf.getInt(INPUT_SPLIT_SIZE_CONFIG, DEFAULT_SPLIT_SIZE);
}
+ /**
+ * Set the predicate that determines what columns will be selected from each row.
+ *
+ * @param conf Job configuration you are about to run
+ * @param predicate
+ */
public static void setSlicePredicate(Configuration conf, SlicePredicate predicate)
{
conf.set(PREDICATE_CONFIG, predicateToString(predicate));