Hi, Justo,

The existing function for processing tdlist was not able to handle embedded spaces. This has been changed in the source code of FastBit in the SVN repository. Please download the source code from the SVN repository when you get the chance. Let us know when you do get a chance to try the new code.

John

PS: Attached is a small test program that generates sample data that might replicate what you are testing. Please let me know if you are OK with including this program as part of the test suite.
/// $Id$
///
/// A program to generate sample data based on the Risk taxonomy used by
/// Justo Ruiz Ferrer <[email protected]>, 2010/12/02.
/// It generates a dataset with five columns named: rowid, rsk1, rsk2,
/// rsk3, riskkeys.

#include <ibis.h>       // ibis name space, FastBit IBIS functions
#include <twister.h>    // ibis::discreteZipf1, ibis::MersenneTwister
#include <iostream>
#include <iomanip>
#include <memory>

/// The number of categories.  This is the number of entries in each of
/// the tables RISKKEYS, RSK1, RSK2 and RSK3 below; fillRow uses
/// NCATEGORIES-1 as the upper limit when drawing an index into them.
#define NCATEGORIES 20
/// Whole category strings.  Each value contains three components padded
/// with blanks into fixed-width fields: the rating (see RSK3), the
/// description (see RSK2, may contain embedded spaces) and the level (see
/// RSK1, may contain embedded spaces).  Entry i of this table corresponds
/// to entry i of RSK1, RSK2 and RSK3.
const char* RISKKEYS[] = {
    "A     Strong                        Good       ",
    "A-    Strong                        Good       ",
    "A+    Strong                        Good       ",
    "AA    Very Strong                   Good       ",
    "AA-   Very Strong                   Good       ",
    "AA+   Very Strong                   Good       ",
    "AAA   Extremely Strong              Good       ",
    "B     More Vulnerable               Not so good",
    "B-    More Vulnerable               Not so good",
    "B+    More Vulnerable               Not so good",
    "BB    Less Vulnerable               Not so good",
    "BB-   Less Vulnerable               Not so good",
    "BB+   Less Vulnerable               Not so good",
    "BBB   Adequate                      FiftyFifty ",
    "BBB-  Adequate                      FiftyFifty ",
    "BBB+  Adequate                      FiftyFifty ",
    "C     Currently Highly Vulnerable   Run Away   ",
    "CC    Currently Highly Vulnerable   Run Away   ",
    "CCC   Currently Vulnerable          Run Away   ",
    "D     Failed                        Run Away   ",};
/// The risk score.  Component 3 of the risk category.  It is the first
/// field of the corresponding RISKKEYS string.  Each entry is the bare
/// term without any padding, so that the term written to the tdlist file
/// is exactly the token that appears in RISKKEYS.
const char* RSK3[] = {
    "A",
    "A-",
    "A+",
    "AA",
    "AA-",
    "AA+",
    "AAA",
    "B",
    "B-",
    "B+",
    "BB",
    "BB-",
    "BB+",
    "BBB",
    "BBB-",
    "BBB+",
    "C",
    "CC",
    "CCC",
    "D",};
/// The risk description.  Component 2 of the risk category.  It is the
/// middle field of the corresponding RISKKEYS string, without the padding.
/// Most of the values contain embedded spaces.
const char* RSK2[] = {
    "Strong",
    "Strong",
    "Strong",
    "Very Strong",
    "Very Strong",
    "Very Strong",
    "Extremely Strong",
    "More Vulnerable",
    "More Vulnerable",
    "More Vulnerable",
    "Less Vulnerable",
    "Less Vulnerable",
    "Less Vulnerable",
    "Adequate",
    "Adequate",
    "Adequate",
    "Currently Highly Vulnerable",
    "Currently Highly Vulnerable",
    "Currently Vulnerable",
    "Failed",};
/// The risk level.  Component 1 of the risk category.  It is the last
/// field of the corresponding RISKKEYS string, without the padding.  Some
/// of the values contain embedded spaces.
const char* RSK1[] = {
    "Good",
    "Good",
    "Good",
    "Good",
    "Good",
    "Good",
    "Good",
    "Not so good",
    "Not so good",
    "Not so good",
    "Not so good",
    "Not so good",
    "Not so good",
    "FiftyFifty",
    "FiftyFifty",
    "FiftyFifty",
    "Run Away",
    "Run Away",
    "Run Away",
    "Run Away",};

/// Term-document list: associates a term with the list of row ids recorded
/// for it.  The keys are raw character pointers, the map does not copy the
/// strings, so the pointers must stay valid for as long as the map is in
/// use.  Keys are ordered by ibis::lessi (presumably a case-insensitive
/// string comparison -- confirm against the FastBit headers).
typedef std::map<const char*, std::vector<uint64_t>, ibis::lessi> TDList;

/// Declare the five columns of the data set (rowid, rsk1, rsk2, rsk3,
/// riskkeys) on tab, and reset val so that it holds one unsigned integer
/// slot, three categorical slots and one text slot with matching names.
static void initColumns(ibis::tablex& tab, ibis::table::row& val) {
    static const char* const riskcols[] = {"rsk1", "rsk2", "rsk3"};
    const unsigned nrisk = sizeof(riskcols) / sizeof(riskcols[0]);

    // the table schema: the row id, the three categorical columns and
    // finally the text column
    tab.addColumn("rowid", ibis::UINT);
    for (unsigned j = 0; j < nrisk; ++ j)
        tab.addColumn(riskcols[j], ibis::CATEGORY);
    tab.addColumn("riskkeys", ibis::TEXT, "concatenated risk keys",
                  "keywords, docidname=rowid");

    // the row buffer: start from an empty row, then name and size each
    // group of values
    val.clear();
    val.textsnames.push_back("riskkeys");
    val.textsvalues.resize(1);
    for (unsigned j = 0; j < nrisk; ++ j)
        val.catsnames.push_back(riskcols[j]);
    val.catsvalues.resize(nrisk);
    val.uintsnames.push_back("rowid");
    val.uintsvalues.resize(1);
} // initColumns

/// Fill val with the content of row number seq and record the row in the
/// term-document list.  A category index is drawn from a Zipf generator
/// that is shared by all calls; the row receives seq as its row id, the
/// whole RISKKEYS string as its text value and the three components as its
/// categorical values.  seq is appended to the tdl entry of each of the
/// three components.
static void fillRow(uint64_t seq, ibis::table::row& val, TDList& tdl) {
    static ibis::MersenneTwister mt;
    static ibis::discreteZipf1 zipf(mt, NCATEGORIES-1);
    const unsigned long pick = zipf();

    // the three components in the order of the categorical columns
    const char* const terms[3] = {RSK1[pick], RSK2[pick], RSK3[pick]};

    val.uintsvalues[0] = seq;
    val.textsvalues[0] = RISKKEYS[pick];
    for (unsigned j = 0; j < 3; ++ j) {
        val.catsvalues[j] = terms[j];
        tdl[terms[j]].push_back(seq);
    }
} // fillRow

/// Append the term-document list to the file "riskkeys.tdlist" in the
/// directory dir.
///
/// Every entry of tdl produces one line made of the term, a colon and the
/// ids recorded for the term separated by blanks.  The file is opened in
/// append mode, existing content is preserved.
///
/// @param tdl the terms and their ids.
/// @param dir name of the directory to contain the file; must not be null.
///
/// Failures to open or to write the file are reported through LOGGER;
/// the function returns nothing.
static void writeTDList(const TDList& tdl, const char* dir) {
    if (dir == 0) { // std::string can not be constructed from a nil pointer
        LOGGER(1)
            << "writeTDList can not proceed without a directory name";
        return;
    }

    std::string fname(dir);
    fname += FASTBIT_DIRSEP;
    fname += "riskkeys.tdlist";
    std::ofstream tdf(fname.c_str(),
                      std::ios::out | std::ios::ate | std::ios::app);
    if (! tdf) {
        LOGGER(1)
            << "writeTDList failed to open " << fname
            << " for appending the new Term-Document entries";
        return;
    }

    for (TDList::const_iterator it = tdl.begin(); it != tdl.end(); ++ it) {
        const std::vector<uint64_t>& ids = (*it).second;
        // the colon terminates the term, which may contain embedded spaces
        tdf << (*it).first << ": ";
        for (size_t j = 0; j < ids.size(); ++ j)
            tdf << ' ' << ids[j];
        tdf << "\n";
    }

    // make sure buffered output reaches the file before checking the state
    tdf.flush();
    LOGGER(! tdf)
        << "Warning -- writeTDList failed to write some of the "
        "Term-Document entries to " << fname;
} // writeTDList

/// main.
///
/// Usage: <program> <output-dir> [#rows [#rows-per-dir [conf-file]]]
///
/// Generates #rows rows of sample data and writes them to the output
/// directory.  When the rows do not fit into one directory, they are split
/// into multiple subdirectories of the output directory, each with at most
/// #rows-per-dir rows.  A file named riskkeys.tdlist is written alongside
/// the data in each directory.
///
/// Returns 0 on completion, -1 if the output directory is not specified,
/// and -2 if the in-memory table can not hold any row.
int main(int argc, char** argv) {
    uint32_t maxrow=0, nrpd=0;
    int nparts, ndigits, ierr;

    // must have the output directory name
    if (argc < 2) {
        std::cerr << "\nUsage:\n" << *argv
                  << " <output-dir> [#rows [#rows-per-dir [conf-file]]]\n"
                  << "If the 4th argument is not provided, the number of "
            "rows per directory will be determined by the memory cache size, "
            "which is by default 1/2 of the physical memory size.\n"
                  << std::endl;
        return -1;
    }

    // initialize the file manager with the 5th argument
    ibis::init(argc>4 ? argv[4] : (const char*)0);
    ibis::util::timer mytimer(*argv, 0);
    if (argc > 2) // user specified maxrow
        maxrow = atof(argv[2]);
    if (maxrow <= 0) { // derive the number of rows from the cache size
        maxrow = ibis::fileManager::currentCacheSize();
        maxrow = ibis::util::compactValue(maxrow / 120.0, maxrow / 80.0);
        nrpd = maxrow;
    }
    if (maxrow < 10)
        maxrow = 10;
    if (argc > 3) // user specified nrpd
        nrpd = atof(argv[3]);
    if (nrpd <= 0) { // derive the number of rows per directory
        nrpd = ibis::fileManager::currentCacheSize();
        nrpd = ibis::util::compactValue(nrpd / 120.0, nrpd / 80.0);
    }
    if (nrpd > maxrow) nrpd = maxrow;

    ibis::table::row val;
    std::auto_ptr<ibis::tablex> tab(ibis::tablex::create());
    initColumns(*tab, val);
    tab->reserveSpace(nrpd);
    if (tab->capacity() < nrpd)
        nrpd = tab->capacity();
    if (nrpd == 0) { // nrpd is used as a divisor below
        std::cerr << *argv << " can not proceed because the in-memory table "
            "has no space for any row" << std::endl;
        return -2;
    }
    LOGGER(1) << *argv << ' ' << argv[1] << ' ' << maxrow << ' ' << nrpd
              << std::endl;
    // number of directories needed, rounded up
    nparts = maxrow / nrpd;
    nparts += (maxrow > nparts*nrpd);
    // number of hexadecimal digits needed to print nparts
    ierr = nparts;
    for (ndigits = 1, ierr >>= 4; ierr > 0; ierr >>= 4, ++ ndigits);
    for (uint32_t irow = 1; irow <= maxrow;) {
        // the last row of this partition; the final partition stops at
        // maxrow so that no more than the requested number of rows is
        // generated
        const uint32_t end =
            (maxrow - irow >= nrpd ? irow - 1 + nrpd : maxrow);
        TDList tdl;
        std::string dir = argv[1];
        if (nparts > 1) { // figure out the directory name
            // use the last component of the output directory name as the
            // prefix of the subdirectory name
            const char* str = strrchr(argv[1], FASTBIT_DIRSEP);
            if (str != 0) {
                if (str[1] == 0) {
                    // the name ends with a separator, move back to the
                    // beginning of the last component without going past
                    // the beginning of the name
                    while (str > argv[1]) {
                        if (*(str-1) == FASTBIT_DIRSEP) break;
                        else -- str;
                    }
                }
                else {
                    ++ str;
                }
            }
            std::ostringstream oss;
            oss << FASTBIT_DIRSEP << (str ? str : "_") << std::hex
                << std::setprecision(ndigits) << std::setw(ndigits)
                << std::setfill('0') << irow / nrpd;
            dir += oss.str();
        }

        for (; irow <= end; ++ irow) {
            fillRow(irow, val, tdl);
            ierr = tab->appendRow(val);
            // a row has five values, one for each column
            LOGGER(ierr != 5)
                << "Warning -- " << *argv << " failed to append row " << irow
                << " to the in-memory table, appendRow returned " << ierr;
        }
        ierr = tab->write(dir.c_str());
        LOGGER(ierr < 0)
            << "Warning -- " << *argv << " failed to write " << tab->mRows()
            << " rows to " << dir << ", ibis::tablex::write returned " << ierr;
        writeTDList(tdl, dir.c_str());
        // get ready for the next partition
        tab->clearData();
        tdl.clear();
    }
    return 0;
} // main
_______________________________________________
FastBit-users mailing list
[email protected]
https://hpcrdm.lbl.gov/cgi-bin/mailman/listinfo/fastbit-users

Reply via email to