//package com.yoterra.se.afp;
import java.io.IOException;
//
// Sample Hbase data importer
// reads from a file called cdr.data and injects into the cdrs table
// reporting the date / time every 1000 commits
// by Anders Brownworth
import java.io.*;
import java.util.*;
//import org.apache.log4j.BasicConfigurator;
//import org.apache.log4j.Level;
//import org.apache.log4j.Logger;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.io.*;
import org.apache.hadoop.hbase.client.*;
public class ArcParserImporter {
private static String rl( FileInputStream in ) {
try {
int i;
char c[] = new char[1];
String s = new String( "" );
while ( ( i = in.read( )) != 10 )
if( i == -1 ) return null;
else {
c[0] = (char) i;
s = s + new String( c );
}
// System.out.println( i + "=" + s );
//c[0] = (char) i;
//s = s + new String( c );
return s;
}
catch ( IOException ex ) {
ex.printStackTrace( );
return null;
}
}
public static void main( String args[] ) throws IOException {
System.out.print( "starting " + new Date() + "..." );
try {
int counter = 0;
int act =0;
int limit=0;
int exno=0;
boolean errf=false;
if (args.length == 0) {
System.out.println("[ERROR] exiting program - no
arc file passed as arg");
System.exit(1);
}
FileInputStream in = new FileInputStream( args[0] ) ;
String line;
// read header and forget about it
// first line has 5 token, the one before 0
// System.out.println( "Header ... " );
int i = 0;
int t = 0;
char[] c = new char[1];
do {
line = rl( in );
StringTokenizer st = new StringTokenizer( line, " " );
i++;
t = st.countTokens( );
// System.out.println( "^^^" + t );
} while ( t != 0 );
// should be done with header, now for the interesting part
//Let's open the HBase connection...
HBaseConfiguration hc = new HBaseConfiguration( new
Configuration( ) );
HTable ht = new HTable( hc, "yotest1" );
// We have table handle now and can start to loop over the
// Arc file content.
do {
StringTokenizer st = new StringTokenizer( line, " " );
if ( st.countTokens( ) == 5 ) {
String url = st.nextToken( );
String ip = st.nextToken( );
String ts = st.nextToken( );
String mime = st.nextToken( );
String len = st.nextToken( );
// System.out.println( url + " " + ip + " " + ts + " " + mime + "
" + len );
java.util.Scanner sk = new java.util.Scanner(
len );
try {
limit = sk.nextInt();
byte[] body = new byte[limit];
//-line.length()];
act = in.read( body, 0 , limit);
//-line.length());
// We have read the header and body now
if it is html we
// can do the clustering process and
write the result into hbase
if( mime.compareTo( "text/html" ) == 0
) {
counter++;
// write it into hbase now
errf = false;
exno = 0;
do {
try {
BatchUpdate bu
= new BatchUpdate( url );
bu.put( "crawltime:", ts.getBytes() );
bu.put( "ip:",
ip.getBytes() );
bu.put( "mime:",
mime.getBytes() );
// need to parse first
bu.put( "respcode:", body );
// need to calculate first
bu.put( "offset:", etBytes() );
bu.put( "size:",
len.getBytes() );
// bu.put( "file:",
arc.getBytes() );
bu.put( "resp:",
body );
String clno =
"" + counter;
bu.put(
"clusterno:", clno.getBytes() );
ht.commit( bu );
} catch ( IOException
aex )
{
exno++;
System.out.println(
"IO Exception No=" + exno );
try {
Thread.sleep( 60000 );
} catch(
InterruptedException ee )
{
System.out.println( "Time to exit..." );
System.exit(1);
}
hc = new
HBaseConfiguration( new Configuration( ) );
ht = new HTable( hc,
"yotest1" );
errf = true;
}
} while ( errf );
}
}
catch ( InputMismatchException ex )
{ System.out.println( "Odd..." ); }
//` catch (SimpleHtmlParserException e) {
e.printStackTrace(); }
}
} while ( ( line = rl( in ) ) != null );
System.out.println( "Wrote " + counter + " pages " + new Date()
);
} catch ( IOException ex ) { ex.printStackTrace( ); }
// } catch (Exception e) {
// e.printStackTrace();
// }
}
}
On Jan 17, 2009, at 3:37 PM, stack wrote:
Derek Pappas wrote:
No. See attached program. It parses the arc files and writes the
html records to hbase.
5 data nodes and 3 regions.
I don't believe this list allows attachments (Program did not come
across). Put it up in pastebin?
Single threaded.
How many instances do you have running? One only?
Tell us what you are seeing in your logs so we can help. Make
sure you have DEBUG enabled (see earlier in the FAQ that J-D
pointed you at for how).
Errors posted below, datanodes complaining of blocks, as J-D
indicates, should be addressed mostly by the troubleshooting
section he pointed you to. You might also check datanode logs for
errors. That could give us a clue as to why the failures occur.
Meantime, how many regions when it fails? Tell us about your
schema and your hardware.
Dell 850s, Super Micro Core Duos, and a quad core.
5 data nodes 3 regions
Add your configuration to pastebin too. What's your schema like?
How many column families?
Anything else running on these systems? They should be well able
to handle this (how much RAM -- are you swapping?).
Enable DEBUG and paste exceptions from regionserver logs including
the lines that lead up to the exception.
Check your datanode logs too.
St.Ack
Best Regards,
Derek Pappas
depappas at yahoo d0t com