Re: production usage of HBase

Derek Pappas Sun, 18 Jan 2009 00:17:31 -0800

//package com.yoterra.se.afp;

import java.io.IOException;


//
// Sample Hbase data importer

// reads from a file called cdr.data and injects into the cdrs table, reporting the date / time every 1000 commits

//   by   Anders Brownworth
import java.io.*;
import java.util.*;
//import org.apache.log4j.BasicConfigurator;
//import org.apache.log4j.Level;
//import org.apache.log4j.Logger;

import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.io.*;
import org.apache.hadoop.hbase.client.*;

public class ArcParserImporter {
        
    private static String rl( FileInputStream in ) {
                try {
                int i;
                char c[] = new char[1];
                String s = new String( "" );
                while ( ( i = in.read( )) != 10 )
                    if( i == -1 ) return null;
                    else {
                        c[0] = (char) i;
                        s = s + new String( c );
                    }
                //    System.out.println( i + "=" + s );
                //c[0] = (char) i;
                //s = s + new String( c );
                return s;

                }
                catch ( IOException ex ) {
                 ex.printStackTrace( );
                return null;
                }
        }

    public static void main( String args[] ) throws IOException {
        System.out.print( "starting " + new Date() + "..." );

        try {

                int counter = 0;
                int act =0;
                int limit=0;
                int exno=0;
                boolean errf=false;

                if (args.length == 0) {

System.out.println("[ERROR] exiting program - noarc file passed as arg");

                    System.exit(1);
                }

                FileInputStream in = new FileInputStream( args[0] ) ;
                String line;
        
                // read header and forget about it
                // first line has 5 token, the one before 0
                // System.out.println( "Header ... " );
                int i = 0;
                int t = 0;
                char[] c = new char[1];
                do {
                        line = rl( in  );
                        StringTokenizer st = new StringTokenizer( line, " " );
                        i++;
                        t = st.countTokens( );
                // System.out.println( "^^^" + t );
                } while ( t != 0 );

                // should be done with header, now for the interesting part

                //Let's open the HBase connection...

HBaseConfiguration hc = new HBaseConfiguration( newConfiguration( ) );

                HTable ht = new HTable( hc, "yotest1" );
                // We have table handle now and can start to loop over the
                // Arc file content.

                do {
                        StringTokenizer st = new StringTokenizer( line, " " );
                        if ( st.countTokens( ) == 5  ) {
                                String url = st.nextToken( );
                                String ip = st.nextToken( );
                                String ts = st.nextToken( );
                                String mime = st.nextToken( );
                                String len = st.nextToken( );

// System.out.println( url + " " + ip + " " + ts + " " + mime + "" + len );

                                java.util.Scanner sk = new java.util.Scanner( 
len );
                                try {
                                        limit = sk.nextInt();
                                        byte[] body = new byte[limit]; 
//-line.length()];
                                        act = in.read( body, 0 , limit); 
//-line.length());
                                        // We have read the header and body now 
if it is html we
                                        // can do the clustering process and 
write the result into hbase
                                        if( mime.compareTo( "text/html" ) == 0 
) {
                                                counter++;
                                                // write it into hbase now
                                                errf = false;
                                                exno = 0;
                                                do {
                                                        try {
                                                                BatchUpdate bu 
= new BatchUpdate( url );

bu.put( "crawltime:", ts.getBytes() );bu.put( "ip:",ip.getBytes() );bu.put( "mime:",mime.getBytes() );// need to parse firstbu.put( "respcode:", body );// need to calculate firstbu.put( "offset:", etBytes() );bu.put( "size:",len.getBytes() );// bu.put( "file:",arc.getBytes() );bu.put( "resp:",body );

                                                                String clno = 
"" + counter;
                                                                bu.put( 
"clusterno:", clno.getBytes() );
                                                                ht.commit( bu );
                                                        } catch ( IOException 
aex )
                                                        {
                                                                exno++;
                                                                System.out.println( 
"IO Exception No=" + exno );
                                                                try {
                                                                        
Thread.sleep( 60000 );
                                                                } catch( 
InterruptedException ee )
                                                                {
                                                                        
System.out.println( "Time to exit..." );
                                                                        
System.exit(1);
                                                                }
                                                                hc = new 
HBaseConfiguration( new Configuration( ) );
                                                                ht = new HTable( hc, 
"yotest1" );
                                                                errf = true;
                                                        }
                                                } while ( errf );
                                        }

                                }

catch ( InputMismatchException ex ){ System.out.println( "Odd..." ); }

//`                             catch (SimpleHtmlParserException e) { 
e.printStackTrace(); }
                        }       
                } while ( ( line = rl( in ) ) != null );
                System.out.println( "Wrote " + counter + " pages " + new Date() 
);
        } catch ( IOException ex ) { ex.printStackTrace( ); }
        //      } catch (Exception e) {
        //      e.printStackTrace();
        //      }

    }

}

On Jan 17, 2009, at 3:37 PM, stack wrote:

Derek Pappas wrote:
No. See attached program. It parses the arc files and writes the html records to hbase.
5 data nodes and 3 regions.
I don't believe this list allows attachments (Program did not come across). Put it up in pastebin?
Single threaded.
How many instances do you have running?  One only?
Tell us what you are seeing in your logs so we can help. Make sure you have DEBUG enabled (see earlier in the FAQ that J-D pointed you at for how).
Errors posted below, datanodes complaining of blocks, as J-D indicates, should be addressed mostly by the troubleshooting section he pointed you to. You might also check datanode logs for errors. Could help give us a clue why the failures.
Meantime, how many regions when it fails? Tell us about your schema and your hardware.
Dell 850's. Super Micro core duo's and a quad core.

5 data nodes 3 regions
Add your configuration to pastebin too. What's your schema like? How many column families?
Anything else running on these systems? They should be well able (How much RAM -- are you swapping?).
Enable DEBUG and paste exceptions from regionserver logs including the lines that lead up to the exception.
Check your datanode logs too.

St.Ack


Best Regards,

Derek Pappas
depappas at yahoo d0t com

Re: production usage of HBase

Reply via email to