Hi, John, Thanks for offering to help. Attached is my test program. estimate_query.cpp is the source code that calls the estimate function. -part.txt is included, and bin.txt has a list of bin boundaries. In case the mail system filters .sh files, I renamed run.sh to run.sh.rename. Please change it back if you want to run it (and modify the path to FastBit as well). Thanks again. Nan
> Date: Wed, 12 Jun 2013 23:48:48 -0700 > From: [email protected] > To: [email protected] > CC: [email protected] > Subject: Re: [FastBit-users] How to enable fastbit to answer the query > without touching raw data > > Hi, Nan, > > Oh, well, I am not as sure about what is going on as I was earlier > today. If you are willing to pack up the smaller test problem you > were using, I could find time to look into a little bit more.. > > John > > > On 6/12/13 7:36 PM, nan zhou wrote: > > Hi, John > > > > Much appreciate for your time and help, please forgive me if I ask > > too much. > > > > I expanded the total number of values to 20,000,000. And the > > estimate function still give min hits as 0, and max hits as 20, > > 000,000 by using same query ( details is at the end) . Could you > > explain a little bit on how the FastBit to do estimate, like how it > > calculate cost, how to decide to load the index or raw data? Maybe, I > > missed other important thing during index build. > > > > Thank you very much > > > > /./estimate_query / > > /number of records where data < 15: evaluate() = *3998670* records./ > > /number of records where data < 15: estimate() = *0* records between 0 > > and *20000000* hits./ > > / > > / > > /--------------------- Histogram ------------------------------------/ > > /0 to 10 has 1999176 elements/ > > /10 to 40 has 5999627 elements/ > > /40 to 70 has 5999200 elements/ > > /70 to 100 has 6001997 elements/ > > /100 to 1.79769e+308 has 0 elements/ > > /--------------------------------------------------------------------/ > > > > Nan > > > >> Date: Wed, 12 Jun 2013 13:22:34 -0700 > >> From: [email protected] > >> To: [email protected] > >> CC: [email protected] > >> Subject: Re: [FastBit-users] How to enable fastbit to answer the > > query without touching raw data > >> > >> When the cost of doing an estimation is high, the function > >> ibis::query::estimate gives back the default answer of 0 as min and N > >> as max (where N is the number of 
rows in the data partition). > >> Technically, this is a correct answer, though it might not be what > >> you expect. > >> > >> In your case, since N = 100, which is very small, the overhead of > >> reading the index into memory or reading the header of the index into > >> memory is as high as reading all values (which takes up only 400 > >> bytes) into memory. The decision of whether the cost is high or not is > >> made by comparing it to reading the raw data values. I imagine that the cost > >> decision is likely to favor reading all N values. > >> > >> Depending on what you want to do with the results of > >> ibis::query::estimate, you might have to call ibis::query::evaluate > >> instead. > >> > >> John > >> > >> > >> On 6/12/13 9:17 AM, nan zhou wrote: > >> > Hi, John, > >> > > >> > Thanks for the reply. I did retrieve the min and the max hits. But the > >> > return value is *0* for the /getMinNumHits/ and *100* for > >> > the /getMaxNumHits/. 100 is the total number of records. > >> > > >> > I am expecting the returned min hits at least is *8* and max hits at > >> > least is *15* for the query which has where clause ( data < 15 ) and > >> > data which has following distribution of each bin. 
> >> > > >> > Records distribution for each bin: > >> >> > value range | # of element locates in this range > >> >> > [0 - 10) | 8 > >> >> > [10 - 20) | 7 // our query touches these two bins > >> >> > [20 - 30) | 12 > >> >> > [30 - 40) | 11 > >> >> > [40 - 50) | 10 > >> >> > [50 - 60) | 9 > >> >> > [60 - 70) | 15 > >> >> > [70 - 80) | 10 > >> >> > [80 - 90) | 7 > >> >> > [90 - 100) | 11 > >> > > >> > > >> > Please see below for the codes I am using: > >> > > >> > /estimate_query.setWhereClause ("data < 15");// > >> > //estimate_query.getHitRows (RIDs);// > >> > // > >> > //uint64_t min_hits = estimate_query.getMinNumHits ();// > >> > //uint64_t max_hits = estimate_query.getMaxNumHits ();// > >> > //uint32_t estimate_size = RIDs.size (); > >> > > >> > /Output:/ > >> > //>>> where data < 15: estimate() *returned 0 records between minimum > >> > 0 and maximum 100 hits.*/ > >> > /* > >> > */ > >> > /*Thanks, */ > >> > /* > >> > */ > >> > /*Nan > >> > */ > >> >> Date: Tue, 11 Jun 2013 23:05:09 -0700 > >> >> From: [email protected] > >> >> To: [email protected] > >> >> CC: [email protected] > >> >> Subject: Re: [FastBit-users] How to enable fastbit to answer the > >> > query without touching raw data > >> >> > >> >> The documentation of ibis::query::estimate states that > >> >> > >> >> Returns 0 for success, a negative value for error. > >> >> > >> >> Since the function call was completed correctly, it should have > >> >> returned 0. To find out the minimum and maximum number of hits > >> >> determined by ibis::query::estimate, you need to call > >> >> ibis::query::getMinNumHits and ibis::query::getMaxNumHits. You can > >> >> see an example of how they are used in examples/ibis.cpp line > > 3549 and > >> >> 3550. > >> >> > >> >> John > >> >> > >> >> > >> >> On 6/11/13 2:50 PM, nan zhou wrote: > >> >> > Hello, > >> >> > > >> >> > Sorry to send this email again, I realized that the email is not > >> >> > sent to fastbit user mailing list. Following is my problem. 
> >> >> > > >> >> > I tried the estimate function as you instructed before, however I > >> >> > got a wrong answer from estimate function (FastBit version is > > 1.3.6). > >> >> > Could you help me ? > >> >> > > >> >> > I have data which has following distribution: > >> >> > value range | # of element locates in this range > >> >> > [0 - 10) | 8 > >> >> > [10 - 20) | 7 > >> >> > [20 - 30) | 12 > >> >> > [30 - 40) | 11 > >> >> > [40 - 50) | 10 > >> >> > [50 - 60) | 9 > >> >> > [60 - 70) | 15 > >> >> > [70 - 80) | 10 > >> >> > [80 - 90) | 7 > >> >> > [90 - 100) | 11 > >> >> > Above data was binned into 4 bins, whose boundaries are "10, 40, > >> >> > 70, 100". > >> >> > > >> >> > I applied estimate function when the query is " xxx where data > >> >> > value < 15 ", the estimate function return 0, which is not right. > >> >> > If i use evaluate function given by same query, the results number > >> >> > is 15 which is correct. > >> >> > > >> >> > Here is my code : > >> >> > > >> >> > vector <uint32_t> RIDs; > >> >> > > >> >> > ibis::part table ("test", static_cast<const char*>(0)); > >> >> > > >> >> > // create a query object with the current user name. > >> >> > ibis::query estimate_query (ibis::util::userName(), &table); > >> >> > ibis::query evaluate_query (ibis::util::userName(), &table); > >> >> > > >> >> > evaluate_query.setWhereClause ("data < 15"); > >> >> > assert (evaluate_query.evaluate () >= 0); > >> >> > evaluate_query.getHitRows (RIDs); > >> >> > > >> >> > uint32_t evaluate_size = RIDs.size (); > >> >> > > >> >> > cout << "number of records where data < 15: evaluate() = " << > >> >> > evaluate_size << " records." 
<< endl; *// here it returns 15* > >> >> > > >> >> > RIDs.clear (); > >> >> > > >> >> > estimate_query.setWhereClause ("data < 15"); > >> >> > estimate_query.getHitRows (RIDs); > >> >> > > >> >> > uint64_t min_hits = estimate_query.getMinNumHits (); > >> >> > uint64_t max_hits = estimate_query.getMaxNumHits (); > >> >> > uint32_t estimate_size = RIDs.size (); > >> >> > > >> >> > cout << "number of records where data < 15: estimate() = " << > >> >> > estimate_size << " records between " << min_hits << " and " << > >> >> > max_hits << " hits." << endl; *// value of variable estimate_size > >> >> > is 0 , and min_hits = 0, and max_hits = 100* > >> >> > > >> >> > Any clue why it is not returning the right value? Thanks > >> >> > > >> >> > Nan > >> >> > > >> >> > > >> >> > > > ---------------------------------------------------------------------- > >> >> > From: [email protected] > >> >> > To: [email protected] > >> >> > Subject: RE: [FastBit-users] How to enable fastbit to answer > > the query > >> >> > without touching raw data > >> >> > Date: Thu, 9 May 2013 22:35:58 +0800 > >> >> > > >> >> > Thank you very much. > >> >> > > >> >> > nan > >> >> > > >> >> >> Date: Wed, 8 May 2013 14:52:31 -0700 > >> >> >> From: [email protected] > >> >> >> To: [email protected] > >> >> >> CC: [email protected] > >> >> >> Subject: Re: [FastBit-users] How to enable fastbit to answer the > >> >> > query without touching raw data > >> >> >> > >> >> >> Yes, your understanding is correct. > >> >> >> > >> >> >> John > >> >> >> > >> >> >> > >> >> >> On 5/8/13 1:38 PM, nan zhou wrote: > >> >> >> > Hi, John, > >> >> >> > > >> >> >> > Further question would be how the `estimate` function works. For > >> >> >> > example, if I have bin boundaries, such as: 0, 10 , 20, 30, > > 40, and > >> >> >> > 50 , six bin boundaries for column A( bin 1: [0, 10), bin 2: > >> > [10, 20), > >> >> >> > bin 3: [20, 30), bin 4 [30, 40), bin 5 [40, 50) ) . The where > >> > clause > >> >> >> > has 21<= A <= 35. 
In such as, all bit positions/RIDs in bin 3 > >> > and bin > >> >> >> > 4 are retrieved, no matter whether the actual value is in > > the query > >> >> >> > range or not. Do I understand it correctly? > >> >> >> > > >> >> >> > Thanks. > >> >> >> > > >> >> >> > nan > >> >> >> >
run.sh.rename
Description: Binary data
# metadata file written by ibis::part::writeMetaData # on Tue Jun 4 23:13:10 2013 BEGIN HEADER Name = "data" Description = "This table was created on Tue Jun 4 22:39:32 2013 with 100 rows and 1 columns." Number_of_columns = 1 Number_of_rows = 100 Timestamp = 1370401990 State = 1 index = <binning nbins=4,binFile="./test/bins.txt" /> END HEADER Begin Column name = "data" data_type = "INT" minimum = 0 maximum = 90 End Column
#include "ibis.h" // FastBit IBIS primary include file
#include "array_t.h"
#include "index.h"
#include <iostream>
#include <vector>
#include <string>
#include <cassert>
using namespace std;
int main (int argc, char *argv [])
{
vector <uint32_t> RIDs;
ibis::part table ("test", static_cast<const char*>(0));
// create a query object with the current user name.
ibis::query estimate_query (ibis::util::userName(), &table);
ibis::query evaluate_query (ibis::util::userName(), &table);
evaluate_query.setWhereClause ("data < 15");
assert (evaluate_query.evaluate () >= 0);
evaluate_query.getHitRows (RIDs);
uint32_t evaluate_size = RIDs.size ();
cout << "number of records where data < 15: evaluate() = " << evaluate_size << " records." << endl;
RIDs.clear ();
estimate_query.setWhereClause ("data < 15");
assert (estimate_query.estimate () >= 0);
estimate_query.getHitRows (RIDs);
uint64_t min_hits = estimate_query.getMinNumHits ();
uint64_t max_hits = estimate_query.getMaxNumHits ();
uint32_t estimate_size = RIDs.size ();
cout << "number of records where data < 15: estimate() = " << estimate_size << " records between " << min_hits << " and " << max_hits << " hits." << endl;
// print bin boundaries and the number of elements in each bin
const ibis::column *cptr = table.getColumn ("data");
ibis::index* idx = ibis::index::create (cptr, "test/data.idx", 0, 1);
if (idx == 0) return -2;
uint32_t numBins = idx->numBitvectors ();
vector <double> binBoundaries;
idx->binBoundaries (binBoundaries);
cout << endl << "--------------------- Histogram ------------------------------------" << endl;
for (int i = 0; i < numBins; i ++) {
const ibis::bitvector* bv = idx->getBitvector (i);
if (i == 0) {
cout << "0" << " to " << binBoundaries [i] << " has " << bv->count () << " elements" << endl;
} else {
cout << binBoundaries [i - 1] << " to " << binBoundaries [i] << " has " << bv->count () << " elements" << endl;
}
}
cout << "--------------------------------------------------------------------" << endl;
cout << endl;
return 0;
}
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <string>
#include <cassert>
#include <stdint.h>
using namespace std;
// Data generator: writes num_elements pseudo-random 32-bit integers (each a
// multiple of 10 in the range 0..90, seeded with srand(0)) as raw binary to
// "test/data", then prints how many times each value was generated.
//
// The directory "test" must already exist. Returns 0 on success and 1 if the
// output file cannot be opened, written, or closed.
int main (int argc, char *argv [])
{
    (void) argc;    // command-line arguments are not used
    (void) argv;

    const uint32_t num_elements = 100;
    std::vector <int32_t> count (100, 0);

    // Make sure you have a test dir
    FILE *fp = fopen ("test/data", "wb");
    if (fp == 0) {
        // without this check a missing directory leads to fwrite on a null FILE*
        perror ("test/data");
        return 1;
    }

    srand (0);
    for (uint32_t i = 0; i < num_elements; ++ i) {
        const int32_t val = (rand () % 10) * 10;
        // item size is 1, so a complete write returns sizeof (val)
        if (fwrite (&val, 1, sizeof (val), fp) != sizeof (val)) {
            perror ("test/data");
            fclose (fp);
            return 1;
        }
        count [val] += 1;
    }
    if (fclose (fp) != 0) {
        perror ("test/data");
        return 1;
    }

    std::cout << std::endl << "----------------------- Data --------------------------------------" << std::endl;
    // iterate over the tally itself so the bound stays valid even if
    // num_elements is changed independently of the tally size
    for (std::vector <int32_t>::size_type v = 0; v < count.size (); ++ v) {
        if (count [v]) {
            std::cout << "Element " << v << " has " << count [v] << " values." << std::endl;
        }
    }
    std::cout << "-------------------------------------------------------------------" << std::endl << std::endl;
    return 0;
}
10 40 70 100
_______________________________________________ FastBit-users mailing list [email protected] https://hpcrdm.lbl.gov/cgi-bin/mailman/listinfo/fastbit-users
