Daniel, I am hoping that my code is incorrect, but I am not sure what is wrong or what is really causing this slowness.
@Henner Zeller, sorry, I forgot to include the object length in the example above. I do store the object length for each object, and I have no problems reading all the objects -- the code works correctly. I just want to make it run faster now. Attaching my code here.

The file format is:

File header
Record1, Record2, Record3, ...

Each record contains n objects of the types defined in the proto file. The first object of a record carries a header, which contains the number of objects in that record.

<code>
// proto file
message HeaderMessage {
  required double timestamp = 1;
  required string ric_code = 2;
  required int32 count = 3;
  required int32 total_message_size = 4;
}

message QuoteMessage {
  enum Side {
    ASK = 0;
    BID = 1;
  }
  required Side type = 1;
  required int32 level = 2;
  optional double price = 3;
  optional int64 size = 4;
  optional int32 count = 5;
  optional HeaderMessage header = 6;
}

message CustomMessage {
  required string field_name = 1;
  required double value = 2;
  optional HeaderMessage header = 3;
}

message TradeMessage {
  optional double price = 1;
  optional int64 size = 2;
  optional int64 AccumulatedVolume = 3;
  optional HeaderMessage header = 4;
}

message AlphaMessage {
  required int32 level = 1;
  required double alpha = 2;
  optional double stddev = 3;
  optional HeaderMessage header = 4;
}
</code>

<code>
// Reading records from the binary file
#include <fcntl.h>
#include <io.h>
#include <tchar.h>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>
#include <hash_set>
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include "messages.pb.h"  // generated from the proto file above (header name illustrative)

using namespace std;
using namespace google::protobuf::io;
using google::protobuf::uint32;

// instruments is passed by const reference; passing the hash_set by value
// copies the whole set on every call.
bool ReadNextRecord(CodedInputStream *coded_input,
                    const stdext::hash_set<std::string> &instruments)
{
    uint32 objtype, objlen;
    int objectsread = 0;
    // One reusable header object. The original code allocated a new
    // HeaderMessage per record (a leak) and, in the QuoteMessage case,
    // called Clear()/Swap() through a null pointer.
    HeaderMessage hMsg;
    TradeMessage tMsg;
    QuoteMessage qMsg;
    CustomMessage cMsg;
    AlphaMessage aMsg;

    while (1) {
        if (!coded_input->ReadLittleEndian32(&objtype)) return false;
        if (!coded_input->ReadLittleEndian32(&objlen)) return false;

        // Confine the parse to this object's bytes.
        CodedInputStream::Limit lim = coded_input->PushLimit(objlen);
        switch (objtype) {
        case 2:
            qMsg.ParseFromCodedStream(coded_input);
            if (qMsg.has_header()) {
                hMsg.Clear();
                hMsg.Swap(qMsg.mutable_header());
            }
            objectsread++;
            break;
        case 3:
            tMsg.ParseFromCodedStream(coded_input);
            if (tMsg.has_header()) {
                hMsg.Clear();
                hMsg.Swap(tMsg.mutable_header());
            }
            objectsread++;
            break;
        case 4:
            aMsg.ParseFromCodedStream(coded_input);
            if (aMsg.has_header()) {
                hMsg.Clear();
                hMsg.Swap(aMsg.mutable_header());
            }
            objectsread++;
            break;
        case 5:
            cMsg.ParseFromCodedStream(coded_input);
            if (cMsg.has_header()) {
                hMsg.Clear();
                hMsg.Swap(cMsg.mutable_header());
            }
            objectsread++;
            break;
        default:
            cout << "Invalid object type " << objtype << endl;
            return false;
        }
        coded_input->PopLimit(lim);

        // The first object of a record carries the header with the count.
        if (objectsread == hMsg.count()) break;
    }
    return true;
}

void ReadAllMessages(ZeroCopyInputStream *raw_input,
                     const stdext::hash_set<std::string> &instruments)
{
    int item_count = 0;
    while (1) {
        // A fresh CodedInputStream per record avoids its cumulative
        // total-bytes limit on long files.
        CodedInputStream in(raw_input);
        if (!ReadNextRecord(&in, instruments)) break;
        item_count++;
    }
    cout << "Finished reading file. Total " << item_count
         << " items read." << endl;
}

int _tmain(int argc, _TCHAR *argv[])
{
    GOOGLE_PROTOBUF_VERIFY_VERSION;

    stdext::hash_set<std::string> instruments;
    string filename = "S:/users/aaj/sandbox/tickdata/bin/hk/2011/2011.01.04.bin";

    int fd = _open(filename.c_str(), _O_BINARY | _O_RDONLY);
    if (fd == -1) {
        printf("Error opening the file.\n");
        exit(1);
    }

    ZeroCopyInputStream *raw_input = new FileInputStream(fd);
    CodedInputStream *coded_input = new CodedInputStream(raw_input);

    uint32 magic_no;
    coded_input->ReadLittleEndian32(&magic_no);
    cout << "HEADER: " << "\t" << magic_no << endl;
    cout << "Reading data objects.." << endl;

    // Destroy this stream before reading records; ReadAllMessages
    // creates its own CodedInputStream per record.
    delete coded_input;

    // cout << td << '\n';   // ('td' is not defined in this snippet as posted)
    ReadAllMessages(raw_input, instruments);
    // cout << td << '\n';

    delete raw_input;
    _close(fd);
    google::protobuf::ShutdownProtobufLibrary();
    return 0;
}
</code>

On Jan 14, 3:37 am, Henner Zeller <henner.zel...@googlemail.com> wrote:
> On Fri, Jan 13, 2012 at 11:22, Daniel Wright <dwri...@google.com> wrote:
> > It's extremely unlikely that text parsing is faster than binary parsing on
> > pretty much any message. My guess is that there's something wrong in the
> > way you're reading the binary file -- e.g. no buffering, or possibly a bug
> > where you hand the protobuf library multiple messages concatenated together.
>
> In particular, the
>   object type, object, object type, object ..
> doesn't seem to include headers that describe the length of the
> following message, but such a separator is needed.
> (http://code.google.com/apis/protocolbuffers/docs/techniques.html#stre...)
>
> > It'd be easier to comment if you post the code.
> > Cheers
> > Daniel
>
> > On Fri, Jan 13, 2012 at 1:22 AM, alok <alok.jad...@gmail.com> wrote:
> >> any suggestions? experiences?
>
> >> regards,
> >> Alok
>
> >> On Jan 11, 1:16 pm, alok <alok.jad...@gmail.com> wrote:
> >> > my point is ..should i have one message something like
>
> >> > Message Record{
> >> >   required HeaderMessage header;
> >> >   optional TradeMessage trade;
> >> >   repeated QuoteMessage quotes; // 0 or more
> >> >   repeated CustomMessage customs; // 0 or more
> >> > }
>
> >> > or rather should i keep my file plain as
> >> > object type, object, objecttype, object
> >> > without worrying about the concept of a record.
>
> >> > Each message in file is usually header + any 1 type of message (trade,
> >> > quote or custom) .. and mostly only 1 quote or custom message not
> >> > more.
>
> >> > what would be faster to decode?
> >> > Regards,
> >> > Alok
>
> >> > On Jan 11, 12:41 pm, alok <alok.jad...@gmail.com> wrote:
> >> > > Hi everyone,
>
> >> > > My program is taking more time to read binary files than the text
> >> > > files. I think the issue is with the structure of the binary files
> >> > > that i have designed. (Or could it be possible that binary decoding is
> >> > > slower than text files parsing? ).
>
> >> > > Data file is a large text file with 1 record per row. upto 1.2 GB.
> >> > > Binary file is around 900 MB.
>
> >> > > - Text file reading takes 3 minutes to read the file.
> >> > > - Binary file reading takes 5 minutes.
>
> >> > > I saw a very strange behavior.
> >> > > - Just to see how long it takes to skim through binary file, i
> >> > > started reading header on each message which holds the length of the
> >> > > message and then skipped that many bytes using the Skip() function of
> >> > > coded_input object. After making this change, i was expecting that
> >> > > reading through file should take less time, but it took more than 10
> >> > > minutes. Is skipping not same as adding n bytes to the file pointer?
> >> > > is it slower to skip the object than read it?
>
> >> > > Are their any guidelines on how the structure should be designed to
> >> > > get the best performance?
>
> >> > > My current structure looks as below
>
> >> > > message HeaderMessage {
> >> > >   required double timestamp = 1;
> >> > >   required string ric_code = 2;
> >> > >   required int32 count = 3;
> >> > >   required int32 total_message_size = 4;
> >> > > }
>
> >> > > message QuoteMessage {
> >> > >   enum Side {
> >> > >     ASK = 0;
> >> > >     BID = 1;
> >> > >   }
> >> > >   required Side type = 1;
> >> > >   required int32 level = 2;
> >> > >   optional double price = 3;
> >> > >   optional int64 size = 4;
> >> > >   optional int32 count = 5;
> >> > >   optional HeaderMessage header = 6;
> >> > > }
>
> >> > > message CustomMessage {
> >> > >   required string field_name = 1;
> >> > >   required double value = 2;
> >> > >   optional HeaderMessage header = 3;
> >> > > }
>
> >> > > message TradeMessage {
> >> > >   optional double price = 1;
> >> > >   optional int64 size = 2;
> >> > >   optional int64 AccumulatedVolume = 3;
> >> > >   optional HeaderMessage header = 4;
> >> > > }
>
> >> > > Binary file format is
> >> > > object type, object, object type, object ...
>
> >> > > 1st object of a record holds header with n number of objects in that
> >> > > record. next n-1 objects will not hold header since they all belong to
> >> > > same record (same update time).
> >> > > now n+1th object belongs to the new record and it will hold header for
> >> > > next record.
>
> >> > > Any advices?
>
> >> > > Regards,
> >> > > Alok

--
You received this message because you are subscribed to the Google Groups "Protocol Buffers" group.
To post to this group, send email to protobuf@googlegroups.com.
To unsubscribe from this group, send email to protobuf+unsubscr...@googlegroups.com.
For more options, visit this group at http://groups.google.com/group/protobuf?hl=en.