Hi all,

By popular request I have created an osmparser.c that operates exclusively on stdin instead of on real files. Why would this benefit you? Most multiprocessor systems leave their second processor idle, so in this setup one processor can run bzip2 -d -c theplanet.osm.bz while the other one runs osmsucker.c. This gives you a parser with a relatively easy-to-read output mechanism for converting the planet into a format more suitable for bulk loading. Theoretically it could be faster; in practice I haven't seen a difference (yet). By default it uses my (old) normalised OSM schema.

Because I have designed a new storage model, a newer version will probably include this model.
http://repo.or.cz/w/handlerosm.git?a=blob;f=osmsucker.c;hb=HEAD


I was informed of the controversial meaning of the word suck in relation to unix pipes. Nevertheless, the commasex^Hparatedvalues cu^Homing out of this program are pretty harmless. To prevent viral infections I have licensed the code under STD, STefan Delicensed; it allows you to do anything with the code except make war.


Stefan



/* strchrnul() and get_current_dir_name() are GNU extensions; glibc only
 * declares them when _GNU_SOURCE is defined before the first system header. */
#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
#include <math.h>
#include <unistd.h>

/*
 * <osm>
 *  <node>
 *   <tag k=".." v=".." />
 *  </node>
 *  <way>
 *   <nd>
 *   <tag>
 *  </way>
 *  <relation>
 *   <member>
 *   <tag>
 *  </relation>
 */

/*
 * nextline: drop the current input line and read the next one from stdin.
 *
 * The macro relies on names from the scope it is expanded in:
 *   char   *start   - buffer holding the current line (freed and replaced)
 *   size_t  tmplen  - buffer capacity as maintained by getline()
 *   ssize_t tmp     - receives getline()'s return value
 *   exit:           - label that is jumped to when getline() returns -1
 *
 * It is wrapped in do { } while (0) so that `nextline;` is a single
 * statement wherever it is used.
 */
#define nextline \
    do { \
        free(start); \
        start = NULL; \
        tmplen = 0; \
        tmp = getline(&start, &tmplen, stdin); \
        if (tmp == -1) { goto exit; } \
    } while (0)
//#define nextline start = end + 1


/*
 * Names of the CSV files the parser writes. They are opened with relative
 * paths, and the generated COPY statements prefix them with the current
 * working directory. They are macros (not const variables) because they are
 * spliced into printf format strings by string-literal concatenation.
 */
#define file_nodes                      "nodes.csv"

/* Additional node outputs that are only produced in BENCHMARK builds. */
#ifdef BENCHMARK
#define file_nodes_uint                 "nodes_uint.csv"
#define file_nodes_gis                  "nodes_gis.csv"
#endif

/* Tags attached to nodes, ways themselves, their tags and their node references. */
#define file_node_tags                  "node_tags.csv"
#define file_ways                       "ways.csv"
#define file_way_tags                   "way_tags.csv"
#define file_way_nds                    "way_nds.csv"
/* Relations, their tags, and one member file per member type. */
#define file_relations                  "relations.csv"
#define file_relation_tags              "relation_tags.csv"
#define file_relation_member_node       "relation_member_node.csv"
#define file_relation_member_relation   "relation_member_relation.csv"
#define file_relation_member_way        "relation_member_way.csv"

/*
 * Convert a textual coordinate in degrees to a 32-bit fixed-point value in
 * which the full 360 degree circle spans 2^32 units.
 *
 * input: NUL-terminated decimal number as accepted by strtod().
 *
 * Returns the scaled value truncated toward zero. Negative coordinates wrap
 * modulo 2^32 (e.g. -90 degrees becomes 3221225472). Input that is not a
 * number, or whose scaled magnitude reaches 2^32, yields 0.
 */
unsigned int coordtouint(char *input) {
        const double units_per_degree = 4294967296.0 / 360.0;
        double scaled = strtod(input, NULL) * units_per_degree;

        /* The comparison is written so that NaN fails it as well. */
        if (!(scaled > -4294967296.0 && scaled < 4294967296.0))
                return 0;

        /* Converting a negative double directly to an unsigned type is
         * undefined behaviour; going through a signed integer makes the
         * wrap-around well defined. */
        return (unsigned int) (long long) scaled;
}

/*
 * Return a newly allocated copy of instr in which every backslash and every
 * single quote is preceded by a backslash.
 *
 * instr: NUL-terminated string to escape; it is not modified.
 *
 * Returns a malloc()ed string the caller must free(), or NULL when instr is
 * NULL or the allocation fails.
 */
char * escape_string(char *instr)
{
        size_t len, extra = 0, i, j = 0;
        char *outstr;

        if (instr == NULL)
                return NULL;

        /* Measure once; the length is needed for sizing and for both loops. */
        len = strlen(instr);

        for (i = 0; i < len; i++)
                if (instr[i] == '\\' || instr[i] == '\'') extra++;

        outstr = malloc(len + extra + 1);
        if (outstr == NULL)
                return NULL;

        /* i runs up to and including len so the terminating NUL is copied. */
        for (i = 0; i <= len; i++) {
                if (instr[i] == '\\' || instr[i] == '\'')
                        outstr[j++] = '\\';
                outstr[j++] = instr[i];
        }
        return outstr;
}


/* Open `path` for writing, or report the failure and terminate the program. */
static FILE *open_output(const char *path) {
    FILE *fp = fopen(path, "w");
    if (fp == NULL) { perror("Open:"); exit(-1); }
    return fp;
}

/* Close an output file; a failing close (e.g. a flush error) is reported on stderr. */
static void close_output(FILE *fp) {
    if (fclose(fp) != 0) perror("Close:");
}

/* Replace *slot with a fresh copy of the first `len` bytes of `value`. */
static void store_attr(char **slot, const char *value, size_t len) {
    free(*slot);
    *slot = strndup(value, len);
}

/* Like store_attr(), but the stored copy has backslashes and single quotes escaped. */
static void store_escaped_attr(char **slot, const char *value, size_t len) {
    char *raw = strndup(value, len);
    free(*slot);
    *slot = (raw != NULL) ? escape_string(raw) : NULL;
    free(raw);
}

/* Print one bulk-load COPY statement for `table`, reading `count` rows from `dir`/`file`. */
static void emit_copy(const char *table, const char *dir, const char *file, unsigned long int count) {
    printf("COPY %lu RECORDS INTO %s from '%s/%s' USING DELIMITERS ',', '\\n', '''';\n",
           count, table, dir, file);
}

/*
 * Read an OSM XML document from stdin, write its contents to a set of CSV
 * files, then print to stdout an SQL script that creates the tables, bulk
 * loads the CSV files and adds keys and constraints.
 *
 * The input is processed one line at a time and only the first '<' on a line
 * is considered, so each element is expected on a line of its own. The first
 * two lines must start with "<?xml" and "<osm"; otherwise a message is
 * printed on stderr and no SQL is produced. Elements are recognised by the
 * length (and where needed the first letter) of their name, attributes by the
 * length and one letter of theirs. Lines whose element is not recognised are
 * reported on stderr and skipped.
 *
 * The program terminates via exit(-1) when an output file cannot be opened
 * or the current directory cannot be determined.
 *
 * Relies on the `nextline` macro, which reads into `start` and jumps to the
 * `exit` label when the input is exhausted.
 */
static void parser() {
    typedef enum { OSM = 0, NODE = 1, WAY = 2, RELATION = 3, TAG = 4, ND = 5, MEMBER = 6 } osm_state_t;
    typedef enum { UNKNOWN = 0, ID, LAT, LON, USER, UID, TIMESTAMP, KEY, VALUE, TYPE, REF, ROLE } key_state_t;

    /* Most recently seen attribute values; each one is heap allocated or NULL. */
    char *attr_id = NULL, *attr_lat = NULL, *attr_lon = NULL, *attr_user = NULL,
         *attr_uid = NULL, *attr_timestamp = NULL, *attr_key = NULL, *attr_value = NULL,
         *attr_type = NULL, *attr_ref = NULL, *attr_role = NULL;

    unsigned int attr_lat_uint = 0;
    unsigned int attr_lon_uint = 0;

    FILE *fd_nodes = open_output(file_nodes);
#ifdef BENCHMARK
    FILE *fd_nodes_uint = open_output(file_nodes_uint);
    FILE *fd_nodes_gis = open_output(file_nodes_gis);
#endif
    FILE *fd_node_tags = open_output(file_node_tags);
    FILE *fd_ways = open_output(file_ways);
    FILE *fd_way_tags = open_output(file_way_tags);
    FILE *fd_way_nds = open_output(file_way_nds);
    FILE *fd_relations = open_output(file_relations);
    FILE *fd_relation_tags = open_output(file_relation_tags);
    FILE *fd_members_node = open_output(file_relation_member_node);
    FILE *fd_members_relation = open_output(file_relation_member_relation);
    FILE *fd_members_way = open_output(file_relation_member_way);

    /* Row counts per output file; they end up in the COPY statements. */
    unsigned long int count_nodes = 0, count_node_tags = 0,
                      count_ways = 0, count_way_tags = 0, count_way_nds = 0,
                      count_relations = 0, count_relation_tags = 0,
                      count_members_node = 0, count_members_relation = 0,
                      count_members_way = 0;

    /* Position of the next <nd>/<member> inside its parent element. */
    unsigned long int sequence = 0;

    osm_state_t current_tag = OSM;
    osm_state_t parent_tag = OSM;

    /* Cleared when the header is not recognised, so that no SQL is printed. */
    int emit_sql = 1;

    char *start = NULL, *end, *nodename, *nodename_end;
    ssize_t tmp;
    size_t tmplen = 0;

    nextline;
    if (strncmp(start, "<?xml", 5) != 0) {
        fprintf(stderr, "osmsucker: input does not start with \"<?xml\"\n");
        emit_sql = 0;
        goto exit;
    }

    nextline;
    if (strncmp(start, "<osm", 4) != 0) {
        fprintf(stderr, "osmsucker: second input line does not start with \"<osm\"\n");
        emit_sql = 0;
        goto exit;
    }

    nextline;

    do {
        int known_element = 1;

        end = strchrnul(start, '\n');

        nodename = strchrnul(start, '<');
        if (*nodename == '\0') {
            /* No element on this line (e.g. a blank line): nothing to parse. */
            nextline;
            continue;
        }
        nodename++;
        nodename_end = strchrnul(nodename, ' ');

        if (nodename[0] == '/') {
            /* Closing tag: forget the attributes of the element that ended. */
            free(attr_id);
            free(attr_lat);
            free(attr_lon);
            free(attr_timestamp);
            free(attr_user);
            free(attr_uid);

            attr_id = attr_lat = attr_lon = attr_user = attr_uid = attr_timestamp = NULL;

            sequence = 0;

            nextline;
            continue;
        }

        /* Identify the element by the length of its name. */
        switch (nodename_end - nodename) {
            case 2:
                current_tag = ND;
                break;
            case 3:
                switch (nodename[0]) {
                    case 'o':
                        current_tag = OSM;
                        break;
                    case 'w':
                        current_tag = WAY;
                        break;
                    case 't':
                        current_tag = TAG;
                        break;
                    default:
                        fprintf(stderr, "--> %c%c", nodename[0], nodename[1]);
                        known_element = 0;
                }
                break;
            case 4:
                current_tag = NODE;
                break;
            case 5:
                /* Five-letter element names are skipped without a diagnostic. */
                nextline;
                continue;
            case 6:
                current_tag = MEMBER;
                break;
            case 8:
                current_tag = RELATION;
                break;
            default:
                fprintf(stderr, "--> %c%c", nodename[0], nodename[1]);
                known_element = 0;
        }

        if (!known_element) {
            /* Without this, the line would be emitted as another row of
             * whatever element type was seen last. */
            nextline;
            continue;
        }

        /* Attributes only exist when a space follows the element name;
         * otherwise nodename_end already points at the terminating NUL. */
        if (*nodename_end == ' ') {
            char *key = nodename_end + 1;

            do {
                char *key_end, *value, *value_end;
                size_t value_len;
                key_state_t current_key = UNKNOWN;

                key_end = strchrnul(key, '=');
                if (key_end >= end)
                    break;

                /* Identify the attribute by name length plus one letter. */
                switch (key_end - key) {
                    case 1:
                        switch (key[0]) {
                            case 'k':
                                current_key = KEY;
                                break;
                            case 'v':
                                current_key = VALUE;
                                break;
                            default:
                                current_key = UNKNOWN;
                        }
                        break;
                    case 2:
                        current_key = ID;
                        break;
                    case 3:
                        switch (key[1]) {
                            case 'a':
                                current_key = LAT;
                                break;
                            case 'o':
                                current_key = LON;
                                break;
                            case 'e':
                                current_key = REF;
                                break;
                            case 'i':
                                current_key = UID;
                                break;
                            default:
                                current_key = UNKNOWN;
                                fprintf(stderr, "--> %c%c\n", key[0], key[1]);
                        }
                        break;
                    case 4:
                        switch (key[0]) {
                            case 'u':
                                current_key = USER;
                                break;
                            case 'r':
                                current_key = ROLE;
                                break;
                            case 't':
                                current_key = TYPE;
                                break;
                            default:
                                current_key = UNKNOWN;
                                fprintf(stderr, "--> %c%c\n", key[0], key[1]);
                        }
                        break;
                    case 9:
                        current_key = TIMESTAMP;
                        break;
                    default: {
                        char *thingie = strndup(key, (key_end - key));
                        current_key = UNKNOWN;

                        fprintf(stderr, "UNKNOWN ATTR %s-> %c%c\n", thingie, key[0], key[1]);
                        free(thingie);
                    }
                }

                /* The value starts after '=' and the opening quote; stop if
                 * the line ends before that or has no closing quote. */
                if (key_end[1] == '\0')
                    break;
                value = key_end + 2;
                value_end = strchr(value, '"');
                if (value_end == NULL || value_end > end)
                    break;
                value_len = (size_t) (value_end - value);

                switch (current_key) {
                    case ID:
                        store_attr(&attr_id, value, value_len);
                        break;

                    case LAT:
                        store_attr(&attr_lat, value, value_len);
                        if (attr_lat != NULL)
                            attr_lat_uint = coordtouint(attr_lat);
                        break;

                    case LON:
                        store_attr(&attr_lon, value, value_len);
                        if (attr_lon != NULL)
                            attr_lon_uint = coordtouint(attr_lon);
                        break;

                    case TIMESTAMP:
                        /* The last character of the value is dropped
                         * (presumably the trailing 'Z' of the timestamp);
                         * an empty value stays empty. */
                        store_attr(&attr_timestamp, value, value_len > 0 ? value_len - 1 : 0);
                        break;

                    case USER:
                        store_escaped_attr(&attr_user, value, value_len);
                        break;

                    case UID:
                        store_attr(&attr_uid, value, value_len);
                        break;

                    case KEY:
                        store_escaped_attr(&attr_key, value, value_len);
                        break;

                    case VALUE:
                        store_escaped_attr(&attr_value, value, value_len);
                        break;

                    case TYPE:
                        store_attr(&attr_type, value, value_len);
                        break;

                    case REF:
                        store_attr(&attr_ref, value, value_len);
                        break;

                    case ROLE:
                        store_escaped_attr(&attr_role, value, value_len);
                        break;

                    default:
                        fprintf(stderr, "--> %c%c\n", value[0], value[1]);
                }

                /* Skip the closing quote and the separating space. */
                key = value_end + 2;
            } while (key < end);
        }

        /* Emit one CSV row for the element on this line. */
        switch (current_tag) {
            case NODE:
                fprintf(fd_nodes, "%s, %s, %s, '%s', '%s'\n", attr_id, attr_lat, attr_lon, attr_uid, attr_timestamp);
#ifdef BENCHMARK
                fprintf(fd_nodes_uint, "%s, %d, %d, '%s', '%s'\n", attr_id, (int) attr_lat_uint, (int) attr_lon_uint, attr_uid, attr_timestamp);
                fprintf(fd_nodes_gis, "%s, 'POINT( %s %s )', '%s', '%s'\n", attr_id, attr_lon, attr_lat, attr_uid, attr_timestamp);
#endif
                count_nodes++;
                break;
            case TAG:
                /* A tag row goes to the file that belongs to its parent element. */
                switch (parent_tag) {
                    case NODE:
                        fprintf(fd_node_tags, "%s, '%s', '%s'\n", attr_id, attr_key, attr_value);
                        count_node_tags++;
                        break;
                    case WAY:
                        fprintf(fd_way_tags, "%s, '%s', '%s'\n", attr_id, attr_key, attr_value);
                        count_way_tags++;
                        break;
                    case RELATION:
                        fprintf(fd_relation_tags, "%s, '%s', '%s'\n", attr_id, attr_key, attr_value);
                        count_relation_tags++;
                        break;
                    default:
                        break;
                }
                break;
            case WAY:
                fprintf(fd_ways, "%s, '%s', '%s'\n", attr_id, attr_uid, attr_timestamp);
                count_ways++;
                break;
            case RELATION:
                fprintf(fd_relations, "%s, '%s', '%s'\n", attr_id, attr_uid, attr_timestamp);
                count_relations++;
                break;
            case MEMBER:
                if (attr_type == NULL) {
                    /* A member without a type attribute cannot be routed to a file. */
                } else if (strcmp(attr_type, "node") == 0) {
                    fprintf(fd_members_node, "%s, %lu, %s, '%s'\n", attr_id, sequence, attr_ref, attr_role);
                    count_members_node++;
                } else if (strcmp(attr_type, "way") == 0) {
                    fprintf(fd_members_way, "%s, %lu, %s, '%s'\n", attr_id, sequence, attr_ref, attr_role);
                    count_members_way++;
                } else if (strcmp(attr_type, "relation") == 0) {
                    fprintf(fd_members_relation, "%s, %lu, %s, '%s'\n", attr_id, sequence, attr_ref, attr_role);
                    count_members_relation++;
                }
                sequence++;
                break;
            case ND:
                fprintf(fd_way_nds, "%s, %lu, %s\n", attr_id, sequence, attr_ref);
                sequence++;
                count_way_nds++;
                break;
            default:
                break;
        }

        if (end - start >= 2 && end[-2] == '/') {
            /* Self-closing element ("/>" just before the line end): its
             * attributes are finished with. */
            switch (current_tag) {
                case NODE:
                    free(attr_lat);
                    free(attr_lon);
                    attr_lat = NULL;
                    attr_lon = NULL;
                    attr_lat_uint = 0;
                    attr_lon_uint = 0;
                    /* fallthrough */

                case WAY:
                case RELATION:
                    free(attr_id);
                    free(attr_timestamp);
                    free(attr_user);
                    free(attr_uid);

                    attr_id = attr_user = attr_uid = attr_timestamp = NULL;

                    sequence = 0;
                    break;

                case TAG:
                    free(attr_key);
                    free(attr_value);

                    attr_key = NULL;
                    attr_value = NULL;
                    break;

                case ND:
                case MEMBER:
                    free(attr_type);
                    free(attr_ref);
                    free(attr_role);

                    attr_type = NULL;
                    attr_ref = NULL;
                    attr_role = NULL;
                    break;

                default:
                    break;
            }
        } else if (current_tag == NODE || current_tag == WAY || current_tag == RELATION) {
            /* An element left open becomes the parent of the tags that follow. */
            parent_tag = current_tag;
        }

        nextline;
    } while (1);

exit:
    free(attr_id);
    free(attr_lat);
    free(attr_lon);
    free(attr_timestamp);
    free(attr_user);
    free(attr_uid);

    free(attr_key);
    free(attr_value);

    free(attr_type);
    free(attr_ref);
    free(attr_role);

    free(start);

    close_output(fd_nodes);
#ifdef BENCHMARK
    close_output(fd_nodes_uint);
    close_output(fd_nodes_gis);
#endif
    close_output(fd_node_tags);
    close_output(fd_ways);
    close_output(fd_way_tags);
    close_output(fd_way_nds);
    close_output(fd_relations);
    close_output(fd_relation_tags);
    close_output(fd_members_node);
    close_output(fd_members_relation);
    close_output(fd_members_way);

    if (!emit_sql)
        return;

    /* The COPY statements need absolute paths to the CSV files. */
    char *current = get_current_dir_name();
    if (current == NULL) { perror("get_current_dir_name"); exit(-1); }

    /* Transaction 1: create the tables and bulk load the CSV files. */
    printf("START TRANSACTION;\n");

    printf("CREATE TABLE nodes_legacy (id integer, long double, lat double, uid long, timestamp timestamptz);\n");
#ifdef BENCHMARK
    printf("CREATE TABLE nodes_legacy_uint (id integer, long integer, lat integer, uid long, timestamp timestamptz);\n");
    printf("CREATE TABLE nodes_legacy_gis (id integer, poi point, uid long, timestamp timestamptz);\n");
#endif
    printf("CREATE TABLE node_tags (node integer, k varchar(255), v varchar(1024));\n");
    printf("CREATE TABLE ways (id integer,uid long, timestamp timestamptz);\n");
    printf("CREATE TABLE way_tags (way integer, k varchar(255), v varchar(1024));\n");
    printf("CREATE TABLE way_nds (way integer, idx integer, to_node integer);\n");
    printf("CREATE TABLE relations(id integer, uid long, timestamp timestamptz);\n");
    printf("CREATE TABLE relation_members_node (relation integer, idx integer, to_node integer, role varchar(255));\n");
    printf("CREATE TABLE relation_members_relation (relation integer, idx integer, to_relation integer, role varchar(255));\n");
    printf("CREATE TABLE relation_members_way (relation integer, idx integer, to_way integer, role varchar(255));\n");
    printf("CREATE TABLE relation_tags (relation integer, k varchar(255), v varchar(1024));\n");

    emit_copy("nodes_legacy", current, file_nodes, count_nodes);
#ifdef BENCHMARK
    emit_copy("nodes_legacy_uint", current, file_nodes_uint, count_nodes);
    emit_copy("nodes_legacy_gis", current, file_nodes_gis, count_nodes);
#endif
    emit_copy("node_tags", current, file_node_tags, count_node_tags);
    emit_copy("ways", current, file_ways, count_ways);
    emit_copy("way_tags", current, file_way_tags, count_way_tags);
    emit_copy("way_nds", current, file_way_nds, count_way_nds);
    emit_copy("relations", current, file_relations, count_relations);
    emit_copy("relation_tags", current, file_relation_tags, count_relation_tags);
    emit_copy("relation_members_node", current, file_relation_member_node, count_members_node);
    emit_copy("relation_members_relation", current, file_relation_member_relation, count_members_relation);
    emit_copy("relation_members_way", current, file_relation_member_way, count_members_way);

    printf("COMMIT;\n");

    /* Transaction 2: sequences and primary keys. */
    printf("START TRANSACTION;\n");

    printf("CREATE SEQUENCE s_nodes AS INTEGER;\n");
    printf("ALTER SEQUENCE s_nodes RESTART WITH (SELECT MAX(id) FROM nodes_legacy);\n");
    printf("ALTER TABLE nodes_legacy ALTER COLUMN id SET NOT NULL;\n");
    printf("ALTER TABLE nodes_legacy ALTER COLUMN id SET DEFAULT NEXT VALUE FOR \"sys\".\"s_nodes\";\n");
    printf("ALTER TABLE nodes_legacy ADD CONSTRAINT pk_nodes_id PRIMARY KEY (id);\n");

    printf("CREATE SEQUENCE s_ways AS INTEGER;\n");
    printf("ALTER SEQUENCE s_ways RESTART WITH (SELECT MAX(id) FROM ways);\n");
    printf("ALTER TABLE ways ALTER COLUMN id SET NOT NULL;\n");
    printf("ALTER TABLE ways ALTER COLUMN id SET DEFAULT NEXT VALUE FOR \"sys\".\"s_ways\";\n");
    printf("ALTER TABLE ways ADD CONSTRAINT pk_ways_id PRIMARY KEY (id);\n");

    printf("CREATE SEQUENCE s_relations AS INTEGER;\n");
    printf("ALTER SEQUENCE s_relations RESTART WITH (SELECT MAX(id) FROM relations);\n");
    printf("ALTER TABLE relations ALTER COLUMN id SET NOT NULL;\n");
    printf("ALTER TABLE relations ALTER COLUMN id SET DEFAULT NEXT VALUE FOR \"sys\".\"s_relations\";\n");
    printf("ALTER TABLE relations ADD CONSTRAINT pk_relations_id PRIMARY KEY (id);\n");

    printf("ALTER TABLE relation_members_node ADD CONSTRAINT pk_relation_members_node PRIMARY KEY (relation, idx);\n");
    printf("ALTER TABLE relation_members_way ADD CONSTRAINT pk_relation_members_way PRIMARY KEY (relation,idx);\n");
    printf("ALTER TABLE relation_members_relation ADD CONSTRAINT pk_relation_members_relation PRIMARY KEY (relation,idx);\n");

    printf("COMMIT;\n");

    /* Transaction 3: remaining primary keys and the foreign keys. */
    printf("START TRANSACTION;\n");

    printf("ALTER TABLE node_tags ADD CONSTRAINT pk_node_tags PRIMARY KEY (node, k);\n");
    printf("ALTER TABLE node_tags ADD CONSTRAINT fk_node_tags_node FOREIGN KEY (node) REFERENCES nodes_legacy (id);\n");

    printf("ALTER TABLE way_tags ADD CONSTRAINT pk_way_tags PRIMARY KEY (way, k);\n");
    printf("ALTER TABLE way_tags ADD CONSTRAINT fk_way_tags_way FOREIGN KEY (way) REFERENCES ways (id);\n");

    printf("ALTER TABLE way_nds ADD CONSTRAINT pk_way_nds PRIMARY KEY (way, idx);\n");
    printf("ALTER TABLE way_nds ADD CONSTRAINT fk_way_nds_way FOREIGN KEY (way) REFERENCES ways (id);\n");
    printf("ALTER TABLE way_nds ADD CONSTRAINT fk_way_nds_node FOREIGN KEY (to_node) REFERENCES nodes_legacy (id);\n");

    printf("ALTER TABLE relation_tags ADD CONSTRAINT pk_relation_tags PRIMARY KEY (relation, k);\n");
    printf("ALTER TABLE relation_tags ADD CONSTRAINT fk_relation_tags FOREIGN KEY (relation) REFERENCES relations (id);\n");

    printf("ALTER TABLE relation_members_node ADD CONSTRAINT fk_relation_members_node FOREIGN KEY (relation) REFERENCES relations (id);\n");
    printf("ALTER TABLE relation_members_node ADD CONSTRAINT fk_relation_members_tonode FOREIGN KEY (to_node) REFERENCES nodes_legacy (id);\n");

    printf("ALTER TABLE relation_members_way ADD CONSTRAINT fk_relation_members_way FOREIGN KEY (relation) REFERENCES relations (id);\n");
    printf("ALTER TABLE relation_members_way ADD CONSTRAINT fk_relation_members_toway FOREIGN KEY (to_way) REFERENCES ways (id);\n");

    printf("ALTER TABLE relation_members_relation ADD CONSTRAINT fk_relation_members_relation FOREIGN KEY (relation) REFERENCES relations (id);\n");
    printf("ALTER TABLE relation_members_relation ADD CONSTRAINT fk_relation_members_torelation FOREIGN KEY (to_relation) REFERENCES relations (id);\n");

    printf("COMMIT;\n");

    free(current);

}


/*
 * Program entry point. Command-line arguments are ignored: the input is
 * taken from stdin by parser(). Always finishes with exit status 0 when
 * parser() returns.
 */
int main(int argc, char *argv[]) {
    (void) argc;
    (void) argv;

    parser();
    return 0;
}
_______________________________________________
dev mailing list
dev@openstreetmap.org
http://lists.openstreetmap.org/listinfo/dev

Reply via email to