//package com.yoterra.se.afp;

import java.io.IOException;

//
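// Sample HBase data importer
//   by   Anders Brownworth
// NOTE: the original sample description ("reads from a file called cdr.data and injects into the cdrs table reporting the date / time every 1000 commits")
// does not match the code below, which reads the ARC file named by the first command-line argument, writes its text/html records
// into the "yotest1" table, and reports the date / time once at the start and once at the end.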
import java.io.*;
import java.util.*;
//import org.apache.log4j.BasicConfigurator;
//import org.apache.log4j.Level;
//import org.apache.log4j.Logger;

import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.io.*;
import org.apache.hadoop.hbase.client.*;

public class ArcParserImporter {
        
    private static String rl( FileInputStream in ) {
                try {
                int i;
                char c[] = new char[1];
                String s = new String( "" );
                while ( ( i = in.read( )) != 10 )
                    if( i == -1 ) return null;
                    else {
                        c[0] = (char) i;
                        s = s + new String( c );
                    }
                //    System.out.println( i + "=" + s );
                //c[0] = (char) i;
                //s = s + new String( c );
                return s;

                }
                catch ( IOException ex ) {
                 ex.printStackTrace( );
                return null;
                }
        }

    public static void main( String args[] ) throws IOException {
        System.out.print( "starting " + new Date() + "..." );

        try {

                int counter = 0;
                int act =0;
                int limit=0;
                int exno=0;
                boolean errf=false;

                if (args.length == 0) {
System.out.println("[ERROR] exiting program - no arc file passed as arg");
                    System.exit(1);
                }

                FileInputStream in = new FileInputStream( args[0] ) ;
                String line;
        
                // read header and forget about it
                // first line has 5 token, the one before 0
                // System.out.println( "Header ... " );
                int i = 0;
                int t = 0;
                char[] c = new char[1];
                do {
                        line = rl( in  );
                        StringTokenizer st = new StringTokenizer( line, " " );
                        i++;
                        t = st.countTokens( );
                // System.out.println( "^^^" + t );
                } while ( t != 0 );

                // should be done with header, now for the interesting part

                //Let's open the HBase connection...
HBaseConfiguration hc = new HBaseConfiguration( new Configuration( ) );
                HTable ht = new HTable( hc, "yotest1" );
                // We have table handle now and can start to loop over the
                // Arc file content.

                do {
                        StringTokenizer st = new StringTokenizer( line, " " );
                        if ( st.countTokens( ) == 5  ) {
                                String url = st.nextToken( );
                                String ip = st.nextToken( );
                                String ts = st.nextToken( );
                                String mime = st.nextToken( );
                                String len = st.nextToken( );
// System.out.println( url + " " + ip + " " + ts + " " + mime + " " + len );
                                java.util.Scanner sk = new java.util.Scanner( 
len );
                                try {
                                        limit = sk.nextInt();
                                        byte[] body = new byte[limit]; 
//-line.length()];
                                        act = in.read( body, 0 , limit); 
//-line.length());
                                        // We have read the header and body now 
if it is html we
                                        // can do the clustering process and 
write the result into hbase
                                        if( mime.compareTo( "text/html" ) == 0 
) {
                                                counter++;
                                                // write it into hbase now
                                                errf = false;
                                                exno = 0;
                                                do {
                                                        try {
                                                                BatchUpdate bu 
= new BatchUpdate( url );
bu.put( "crawltime:", ts.getBytes() ); bu.put( "ip:", ip.getBytes() ); bu.put( "mime:", mime.getBytes() ); // need to parse first bu.put( "respcode:", body ); // need to calculate first bu.put( "offset:", etBytes() ); bu.put( "size:", len.getBytes() ); // bu.put( "file:", arc.getBytes() ); bu.put( "resp:", body );
                                                                String clno = 
"" + counter;
                                                                bu.put( 
"clusterno:", clno.getBytes() );
                                                                ht.commit( bu );
                                                        } catch ( IOException 
aex )
                                                        {
                                                                exno++;
                                                                System.out.println( 
"IO Exception No=" + exno );
                                                                try {
                                                                        
Thread.sleep( 60000 );
                                                                } catch( 
InterruptedException ee )
                                                                {
                                                                        
System.out.println( "Time to exit..." );
                                                                        
System.exit(1);
                                                                }
                                                                hc = new 
HBaseConfiguration( new Configuration( ) );
                                                                ht = new HTable( hc, 
"yotest1" );
                                                                errf = true;
                                                        }
                                                } while ( errf );
                                        }

                                }                       
catch ( InputMismatchException ex ) { System.out.println( "Odd..." ); }
//`                             catch (SimpleHtmlParserException e) { 
e.printStackTrace(); }
                        }       
                } while ( ( line = rl( in ) ) != null );
                System.out.println( "Wrote " + counter + " pages " + new Date() 
);
        } catch ( IOException ex ) { ex.printStackTrace( ); }
        //      } catch (Exception e) {
        //      e.printStackTrace();
        //      }

    }

}

On Jan 17, 2009, at 3:37 PM, stack wrote:

Derek Pappas wrote:
No. See attached program. It parses the arc files and writes the html records to hbase.
5 data nodes and 3 regions.

I don't believe this list allows attachments (Program did not come across). Put it up in pastebin?


Single threaded.

How many instances do you have running?  One only?


Tell us what you are seeing in your logs so we can help. Make sure you have DEBUG enabled (see earlier in the FAQ that J-D pointed you at for how).

Errors posted below, with datanodes complaining of blocks, as J-D indicates, should mostly be addressed by the troubleshooting section he pointed you to. You might also check the datanode logs for errors; that could give us a clue as to why the failures are happening.

Meantime, how many regions when it fails? Tell us about your schema and your hardware.

Dell 850's. Super Micro core duo's and a quad core.

5 data nodes 3 regions

Add your configuration to pastebin too. What's your schema like? How many column families?

Anything else running on these systems? They should be well able (How much RAM -- are you swapping?).

Enable DEBUG and paste exceptions from regionserver logs including the lines that lead up to the exception.

Check your datanode logs too.

St.Ack

Best Regards,

Derek Pappas
depappas at yahoo d0t com




Reply via email to