Hi all,
By popular request I have created an osmparser.c that, instead of real
files, operates exclusively on stdin. Why would this benefit you? Most
multiprocessor systems do nothing with their second processor, so in
this case one processor can do bzip2 -d -c theplanet.osm.bz and the
other one can run osmsucker.c. This gives you a parser with a relatively
easy-to-read output mechanism to convert the planet to a more suitable
format for bulk loading. Theoretically it could be faster, practically I
didn't see the difference (yet). By default it uses my (old) normalised
OSM schema.
Because I have designed a new storage model, newer versions will probably
include this model.
http://repo.or.cz/w/handlerosm.git?a=blob;f=osmsucker.c;hb=HEAD
I was informed on the controversial meaning of the word suck in relation
to unix pipes. Nevertheless, the commasex^Hparatedvalues cu^Homing out
of this program are pretty harmless. To prevent viral infections I have
licensed the code under STD (STefan Delicensed); it allows you to do
anything with the code except make war.
Stefan
#define _GNU_SOURCE /* strchrnul(), get_current_dir_name() are GNU extensions */
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>

#include <errno.h>
#include <fcntl.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/*
* <osm>
* <node>
* <tag k=".." v=".." />
* </node>
* <way>
* <nd>
* <tag>
* </way>
* <relation>
* <member>
* <tag>
* </relation>
*/
/*
 * nextline: drop the current input line and read the next one from stdin.
 *
 * The macro relies on the enclosing function providing:
 *   char   *start   - line buffer owned by getline() (freed here each time)
 *   ssize_t tmp     - receives the getline() return value
 *   size_t  tmplen  - buffer capacity as maintained by getline()
 *   exit:           - label jumped to when getline() returns -1
 *
 * It is wrapped in do { } while (0) so that `nextline;` is one statement.
 */
#define nextline \
    do { \
        free(start); \
        start = NULL; \
        tmp = getline(&start, &tmplen, stdin); \
        if (tmp == -1) { goto exit; } \
    } while (0)

/* Names of the CSV files written to the current working directory. */
#define file_nodes "nodes.csv"
#ifdef BENCHMARK
#define file_nodes_uint "nodes_uint.csv"
#define file_nodes_gis "nodes_gis.csv"
#endif
#define file_node_tags "node_tags.csv"
#define file_ways "ways.csv"
#define file_way_tags "way_tags.csv"
#define file_way_nds "way_nds.csv"
#define file_relations "relations.csv"
#define file_relation_tags "relation_tags.csv"
#define file_relation_member_node "relation_member_node.csv"
#define file_relation_member_relation "relation_member_relation.csv"
#define file_relation_member_way "relation_member_way.csv"
/*
 * Convert a decimal-degree coordinate string to an unsigned fixed-point
 * value in which 360 degrees correspond to 2^32 steps.
 *
 * input: NUL-terminated text parsed with strtod(); must not be NULL.
 *
 * Returns the scaled value truncated toward zero and reduced modulo the
 * range of unsigned int, so negative coordinates wrap around (assuming a
 * 32-bit unsigned int, -1 degree maps to 2^32 - 11930464). Returns 0 when
 * the scaled value is NaN or too large to be converted safely.
 */
unsigned int coordtouint(const char *input) {
    const double steps_per_degree = 4294967296.0 / 360.0;
    double scaled = strtod(input, NULL) * steps_per_degree;

    /* Rejects NaN as well: every comparison with NaN is false. */
    if (!(scaled > -9.0e18 && scaled < 9.0e18))
        return 0;

    /*
     * Converting a negative double straight to an unsigned type is
     * undefined behaviour; go through a wide signed type first so the
     * wrap-around is the well-defined signed-to-unsigned conversion.
     */
    return (unsigned int) (long long) scaled;
}
/*
 * Return a newly allocated copy of instr in which every backslash and
 * single quote is preceded by a backslash.
 *
 * instr: NUL-terminated string to escape.
 *
 * Returns a malloc()ed string the caller must free(), or NULL when instr
 * is NULL or the allocation fails.
 */
char *escape_string(const char *instr)
{
    size_t in_len, extra = 0, i, j = 0;
    char *outstr;

    if (instr == NULL)
        return NULL;

    in_len = strlen(instr);
    for (i = 0; i < in_len; i++) {
        if (instr[i] == '\\' || instr[i] == '\'')
            extra++;
    }

    outstr = malloc(in_len + extra + 1);
    if (outstr == NULL)
        return NULL;

    /* "<=" on purpose: the terminating NUL is copied along with the text. */
    for (i = 0; i <= in_len; i++) {
        if (instr[i] == '\\' || instr[i] == '\'')
            outstr[j++] = '\\';
        outstr[j++] = instr[i];
    }
    return outstr;
}
/* Open one output CSV for writing; report the error and terminate on failure. */
static FILE *osm_open_csv(const char *path)
{
    FILE *fp = fopen(path, "w");

    if (fp == NULL) { perror("Open:"); exit(-1); }
    return fp;
}

/* Close one output CSV and report a failed flush/close on stderr. */
static void osm_close_csv(FILE *fp)
{
    if (fclose(fp) != 0)
        perror("Close:");
}

/*
 * Replace *slot with a fresh copy of the first len bytes of value, passing
 * the copy through escape_string() when escape is non-zero.
 */
static void osm_set_attr(char **slot, const char *value, size_t len, int escape)
{
    char *copy = strndup(value, len);

    free(*slot);
    if (escape && copy != NULL) {
        char *escaped = escape_string(copy);
        free(copy);
        copy = escaped;
    }
    *slot = copy;
}

/* Write count fixed SQL lines to stdout. */
static void osm_print_lines(const char *const *lines, size_t count)
{
    size_t i;

    for (i = 0; i < count; i++)
        fputs(lines[i], stdout);
}

/* Write the bulk-load statement for one CSV file to stdout. */
static void osm_print_copy(unsigned long int records, const char *table,
                           const char *dir, const char *file)
{
    printf("COPY %lu RECORDS INTO %s from '%s/%s' USING DELIMITERS ',', '\\n', '''';\n",
           records, table, dir, file);
}

/*
 * Read an OSM XML document from stdin one line at a time, write one CSV file
 * per table into the current directory and, once input ends, print to stdout
 * the SQL that creates the tables, bulk-loads the CSV files and adds keys.
 *
 * The parser is line oriented: it looks at the first '<' on each line, picks
 * the element type from the LENGTH of the tag name (plus its first letter for
 * three-letter names) and picks attributes from the length of the attribute
 * name plus one letter. Attribute values are expected in double quotes and
 * separated by exactly one character.
 *
 * If the first line does not start with "<?xml" or the second does not start
 * with "<osm", the function releases its resources and returns without
 * printing any SQL.
 *
 * NOTE(review): attributes absent from an element stay NULL and are passed to
 * fprintf("%s") as such; confirm the C library in use tolerates that.
 * NOTE(review): a tag name of unrecognised length leaves current_tag at its
 * previous value, so that line is emitted as the previous element type.
 */
static void parser(void) {
    typedef enum { OSM = 0, NODE = 1, WAY = 2, RELATION = 3, TAG = 4, ND = 5,
                   MEMBER = 6 } osm_state_t;
    typedef enum { UNKNOWN = 0, ID, LAT, LON, USER, UID, TIMESTAMP, KEY, VALUE,
                   TYPE, REF, ROLE } key_state_t;

    char *attr_id = NULL, *attr_lat = NULL, *attr_lon = NULL,
         *attr_user = NULL, *attr_uid = NULL, *attr_timestamp = NULL,
         *attr_key = NULL, *attr_value = NULL,
         *attr_type = NULL, *attr_ref = NULL, *attr_role = NULL;
    unsigned int attr_lat_uint = 0;
    unsigned int attr_lon_uint = 0;

    FILE *fd_nodes = osm_open_csv(file_nodes);
#ifdef BENCHMARK
    FILE *fd_nodes_uint = osm_open_csv(file_nodes_uint);
    FILE *fd_nodes_gis = osm_open_csv(file_nodes_gis);
#endif
    FILE *fd_node_tags = osm_open_csv(file_node_tags);
    FILE *fd_ways = osm_open_csv(file_ways);
    FILE *fd_way_tags = osm_open_csv(file_way_tags);
    FILE *fd_way_nds = osm_open_csv(file_way_nds);
    FILE *fd_relations = osm_open_csv(file_relations);
    FILE *fd_relation_tags = osm_open_csv(file_relation_tags);
    FILE *fd_members_node = osm_open_csv(file_relation_member_node);
    FILE *fd_members_relation = osm_open_csv(file_relation_member_relation);
    FILE *fd_members_way = osm_open_csv(file_relation_member_way);

    unsigned long int count_nodes = 0, count_node_tags = 0,
                      count_ways = 0, count_way_tags = 0, count_way_nds = 0,
                      count_relations = 0, count_relation_tags = 0,
                      count_members_node = 0, count_members_relation = 0,
                      count_members_way = 0;
    unsigned long int sequence = 0;

    osm_state_t current_tag = OSM;
    osm_state_t parent_tag = OSM;

    /* start, tmp, tmplen and the exit: label are used by the nextline macro. */
    char *start = NULL, *end, *nodename, *nodename_end;
    ssize_t tmp;
    size_t tmplen = 0;
    int emit_sql = 1;

    /* Header: an XML declaration followed by the <osm> root element. */
    nextline;
    if (strncmp(start, "<?xml", 5) != 0) {
        emit_sql = 0;
        goto exit;
    }
    nextline;
    if (strncmp(start, "<osm", 4) != 0) {
        emit_sql = 0;
        goto exit;
    }
    nextline;

    for (;;) {
        char *lt, *key, *value_end;

        end = strchrnul(start, '\n');
        lt = strchrnul(start, '<');
        if (lt[0] == '\0' || lt[1] == '\0') {
            /* No element on this line; nothing after '<' may be read. */
            nextline;
            continue;
        }
        nodename = lt + 1;
        nodename_end = strchrnul(nodename, ' ');

        if (nodename[0] == '/') {
            /* Closing tag: forget the attributes of the enclosing element. */
            free(attr_id);
            free(attr_lat);
            free(attr_lon);
            free(attr_timestamp);
            free(attr_user);
            free(attr_uid);
            attr_id = attr_lat = attr_lon = attr_user = attr_uid =
                attr_timestamp = NULL;
            sequence = 0;
            nextline;
            continue;
        }

        /* Element type, chosen by the length of the tag name. */
        switch (nodename_end - nodename) {
        case 2:
            current_tag = ND;
            break;
        case 3:
            switch (nodename[0]) {
            case 'o':
                current_tag = OSM;
                break;
            case 'w':
                current_tag = WAY;
                break;
            case 't':
                current_tag = TAG;
                break;
            default:
                fprintf(stderr, "--> %c%c", nodename[0], nodename[1]);
            }
            break;
        case 4:
            current_tag = NODE;
            break;
        case 5:
            /* Five-letter tag names are skipped entirely. */
            nextline;
            continue;
        case 6:
            current_tag = MEMBER;
            break;
        case 8:
            current_tag = RELATION;
            break;
        default:
            fprintf(stderr, "--> %c%c", nodename[0], nodename[1]);
        }

        /*
         * Attribute scan. When no space follows the tag name there are no
         * attributes, and nodename_end + 1 would lie past the terminator.
         */
        key = (*nodename_end != '\0') ? nodename_end + 1 : end;
        while (key < end) {
            char *key_end, *value;
            size_t value_len;
            key_state_t current_key = UNKNOWN;

            key_end = strchrnul(key, '=');
            if (key_end >= end)
                break;

            /* Attribute name, chosen by its length plus one letter. */
            switch (key_end - key) {
            case 1:
                switch (key[0]) {
                case 'k':
                    current_key = KEY;
                    break;
                case 'v':
                    current_key = VALUE;
                    break;
                default:
                    current_key = UNKNOWN;
                }
                break;
            case 2:
                current_key = ID;
                break;
            case 3:
                switch (key[1]) {
                case 'a':
                    current_key = LAT;
                    break;
                case 'o':
                    current_key = LON;
                    break;
                case 'e':
                    current_key = REF;
                    break;
                case 'i':
                    current_key = UID;
                    break;
                default:
                    current_key = UNKNOWN;
                    fprintf(stderr, "--> %c%c\n", key[0], key[1]);
                }
                break;
            case 4:
                switch (key[0]) {
                case 'u':
                    current_key = USER;
                    break;
                case 'r':
                    current_key = ROLE;
                    break;
                case 't':
                    current_key = TYPE;
                    break;
                default:
                    current_key = UNKNOWN;
                    fprintf(stderr, "--> %c%c\n", key[0], key[1]);
                }
                break;
            case 9:
                /* Other nine-letter names must not overwrite the timestamp. */
                current_key = (key[0] == 't') ? TIMESTAMP : UNKNOWN;
                break;
            default:
                current_key = UNKNOWN;
                fprintf(stderr, "UNKNOWN ATTR %.*s-> %c%c\n",
                        (int) (key_end - key), key, key[0], key[1]);
            }

            /* Skip '=' and the opening quote, staying inside the buffer. */
            if (key_end[1] == '\0')
                break;
            value = key_end + 2;
            value_end = strchr(value, '"');
            if (value_end == NULL || value_end > end)
                break;
            value_len = (size_t) (value_end - value);

            switch (current_key) {
            case ID:
                osm_set_attr(&attr_id, value, value_len, 0);
                break;
            case LAT:
                osm_set_attr(&attr_lat, value, value_len, 0);
                attr_lat_uint = attr_lat ? coordtouint(attr_lat) : 0;
                break;
            case LON:
                osm_set_attr(&attr_lon, value, value_len, 0);
                attr_lon_uint = attr_lon ? coordtouint(attr_lon) : 0;
                break;
            case TIMESTAMP:
                /* The last character of the timestamp is dropped on purpose. */
                osm_set_attr(&attr_timestamp, value,
                             value_len > 0 ? value_len - 1 : 0, 0);
                break;
            case USER:
                osm_set_attr(&attr_user, value, value_len, 1);
                break;
            case UID:
                osm_set_attr(&attr_uid, value, value_len, 0);
                break;
            case KEY:
                osm_set_attr(&attr_key, value, value_len, 1);
                break;
            case VALUE:
                osm_set_attr(&attr_value, value, value_len, 1);
                break;
            case TYPE:
                osm_set_attr(&attr_type, value, value_len, 0);
                break;
            case REF:
                osm_set_attr(&attr_ref, value, value_len, 0);
                break;
            case ROLE:
                osm_set_attr(&attr_role, value, value_len, 1);
                break;
            default:
                fprintf(stderr, "--> %c%c\n", value[0], value[1]);
            }

            /* Step over the closing quote and the separator. */
            key = value_end + 2;
        }

        /* Emit one CSV row for the element found on this line. */
        switch (current_tag) {
        case NODE:
            fprintf(fd_nodes, "%s, %s, %s, '%s', '%s'\n", attr_id, attr_lat,
                    attr_lon, attr_uid, attr_timestamp);
#ifdef BENCHMARK
            /* Printed as signed values; the casts make the arguments match %d. */
            fprintf(fd_nodes_uint, "%s, %d, %d, '%s', '%s'\n", attr_id,
                    (int) attr_lat_uint, (int) attr_lon_uint, attr_uid,
                    attr_timestamp);
            fprintf(fd_nodes_gis, "%s, 'POINT( %s %s )', '%s', '%s'\n",
                    attr_id, attr_lon, attr_lat, attr_uid, attr_timestamp);
#endif
            count_nodes++;
            break;
        case TAG:
            switch (parent_tag) {
            case NODE:
                fprintf(fd_node_tags, "%s, '%s', '%s'\n", attr_id,
                        attr_key, attr_value);
                count_node_tags++;
                break;
            case WAY:
                fprintf(fd_way_tags, "%s, '%s', '%s'\n", attr_id,
                        attr_key, attr_value);
                count_way_tags++;
                break;
            case RELATION:
                fprintf(fd_relation_tags, "%s, '%s', '%s'\n",
                        attr_id, attr_key, attr_value);
                count_relation_tags++;
                break;
            default:
                break;
            }
            break;
        case WAY:
            fprintf(fd_ways, "%s, '%s', '%s'\n", attr_id, attr_uid,
                    attr_timestamp);
            count_ways++;
            break;
        case RELATION:
            fprintf(fd_relations, "%s, '%s', '%s'\n", attr_id, attr_uid,
                    attr_timestamp);
            count_relations++;
            break;
        case MEMBER:
            if (attr_type == NULL) {
                /* No type attribute seen: nothing to dispatch on. */
            } else if (strcmp(attr_type, "node") == 0) {
                fprintf(fd_members_node, "%s, %lu, %s, '%s'\n",
                        attr_id, sequence, attr_ref, attr_role);
                count_members_node++;
            } else if (strcmp(attr_type, "way") == 0) {
                fprintf(fd_members_way, "%s, %lu, %s, '%s'\n", attr_id,
                        sequence, attr_ref, attr_role);
                count_members_way++;
            } else if (strcmp(attr_type, "relation") == 0) {
                fprintf(fd_members_relation, "%s, %lu, %s, '%s'\n",
                        attr_id, sequence, attr_ref, attr_role);
                count_members_relation++;
            }
            sequence++;
            break;
        case ND:
            fprintf(fd_way_nds, "%s, %lu, %s\n", attr_id, sequence, attr_ref);
            sequence++;
            count_way_nds++;
            break;
        default:
            break;
        }

        if (end - start >= 2 && end[-2] == '/') {
            /* Self-closing element: its attributes are finished with. */
            switch (current_tag) {
            case NODE:
                free(attr_lat);
                free(attr_lon);
                attr_lat = NULL;
                attr_lon = NULL;
                attr_lat_uint = 0;
                attr_lon_uint = 0;
                /* fallthrough */
            case WAY:
            case RELATION:
                free(attr_id);
                free(attr_timestamp);
                free(attr_user);
                free(attr_uid);
                attr_id = attr_user = attr_uid = attr_timestamp = NULL;
                sequence = 0;
                break;
            case TAG:
                free(attr_key);
                free(attr_value);
                attr_key = NULL;
                attr_value = NULL;
                break;
            case ND:
            case MEMBER:
                free(attr_type);
                free(attr_ref);
                free(attr_role);
                attr_type = NULL;
                attr_ref = NULL;
                attr_role = NULL;
                break;
            default:
                break;
            }
        } else if (current_tag == NODE || current_tag == WAY ||
                   current_tag == RELATION) {
            /* Opening tag with children: following <tag> lines belong to it. */
            parent_tag = current_tag;
        }
        nextline;
    }

exit:
    /* getline() may leave a buffer behind even when it reports failure. */
    free(start);
    free(attr_id);
    free(attr_lat);
    free(attr_lon);
    free(attr_timestamp);
    free(attr_user);
    free(attr_uid);
    free(attr_key);
    free(attr_value);
    free(attr_type);
    free(attr_ref);
    free(attr_role);

    osm_close_csv(fd_nodes);
#ifdef BENCHMARK
    osm_close_csv(fd_nodes_uint);
    osm_close_csv(fd_nodes_gis);
#endif
    osm_close_csv(fd_node_tags);
    osm_close_csv(fd_ways);
    osm_close_csv(fd_way_tags);
    osm_close_csv(fd_way_nds);
    osm_close_csv(fd_relations);
    osm_close_csv(fd_relation_tags);
    osm_close_csv(fd_members_node);
    osm_close_csv(fd_members_relation);
    osm_close_csv(fd_members_way);

    if (!emit_sql)
        return;

    static const char *const create_sql[] = {
        "START TRANSACTION;\n",
        "CREATE TABLE nodes_legacy (id integer, long double, lat double, uid long, timestamp timestamptz);\n",
#ifdef BENCHMARK
        "CREATE TABLE nodes_legacy_uint (id integer, long integer, lat integer, uid long, timestamp timestamptz);\n",
        "CREATE TABLE nodes_legacy_gis (id integer, poi point, uid long, timestamp timestamptz);\n",
#endif
        "CREATE TABLE node_tags (node integer, k varchar(255), v varchar(1024));\n",
        "CREATE TABLE ways (id integer,uid long, timestamp timestamptz);\n",
        "CREATE TABLE way_tags (way integer, k varchar(255), v varchar(1024));\n",
        "CREATE TABLE way_nds (way integer, idx integer, to_node integer);\n",
        "CREATE TABLE relations(id integer, uid long, timestamp timestamptz);\n",
        "CREATE TABLE relation_members_node (relation integer, idx integer, to_node integer, role varchar(255));\n",
        "CREATE TABLE relation_members_relation (relation integer, idx integer, to_relation integer, role varchar(255));\n",
        "CREATE TABLE relation_members_way (relation integer, idx integer, to_way integer, role varchar(255));\n",
        "CREATE TABLE relation_tags (relation integer, k varchar(255), v varchar(1024));\n",
    };
    static const char *const constraint_sql[] = {
        "COMMIT;\n",
        "START TRANSACTION;\n",
        "CREATE SEQUENCE s_nodes AS INTEGER;\n",
        "ALTER SEQUENCE s_nodes RESTART WITH (SELECT MAX(id) FROM nodes_legacy);\n",
        "ALTER TABLE nodes_legacy ALTER COLUMN id SET NOT NULL;\n",
        "ALTER TABLE nodes_legacy ALTER COLUMN id SET DEFAULT NEXT VALUE FOR \"sys\".\"s_nodes\";\n",
        "ALTER TABLE nodes_legacy ADD CONSTRAINT pk_nodes_id PRIMARY KEY (id);\n",
        "CREATE SEQUENCE s_ways AS INTEGER;\n",
        "ALTER SEQUENCE s_ways RESTART WITH (SELECT MAX(id) FROM ways);\n",
        "ALTER TABLE ways ALTER COLUMN id SET NOT NULL;\n",
        "ALTER TABLE ways ALTER COLUMN id SET DEFAULT NEXT VALUE FOR \"sys\".\"s_ways\";\n",
        "ALTER TABLE ways ADD CONSTRAINT pk_ways_id PRIMARY KEY (id);\n",
        "CREATE SEQUENCE s_relations AS INTEGER;\n",
        "ALTER SEQUENCE s_relations RESTART WITH (SELECT MAX(id) FROM relations);\n",
        "ALTER TABLE relations ALTER COLUMN id SET NOT NULL;\n",
        "ALTER TABLE relations ALTER COLUMN id SET DEFAULT NEXT VALUE FOR \"sys\".\"s_relations\";\n",
        "ALTER TABLE relations ADD CONSTRAINT pk_relations_id PRIMARY KEY (id);\n",
        "ALTER TABLE relation_members_node ADD CONSTRAINT pk_relation_members_node PRIMARY KEY (relation, idx);\n",
        "ALTER TABLE relation_members_way ADD CONSTRAINT pk_relation_members_way PRIMARY KEY (relation,idx);\n",
        "ALTER TABLE relation_members_relation ADD CONSTRAINT pk_relation_members_relation PRIMARY KEY (relation,idx);\n",
        "COMMIT;\n",
        "START TRANSACTION;\n",
        "ALTER TABLE node_tags ADD CONSTRAINT pk_node_tags PRIMARY KEY (node, k);\n",
        "ALTER TABLE node_tags ADD CONSTRAINT fk_node_tags_node FOREIGN KEY (node) REFERENCES nodes_legacy (id);\n",
        "ALTER TABLE way_tags ADD CONSTRAINT pk_way_tags PRIMARY KEY (way, k);\n",
        "ALTER TABLE way_tags ADD CONSTRAINT fk_way_tags_way FOREIGN KEY (way) REFERENCES ways (id);\n",
        "ALTER TABLE way_nds ADD CONSTRAINT pk_way_nds PRIMARY KEY (way, idx);\n",
        "ALTER TABLE way_nds ADD CONSTRAINT fk_way_nds_way FOREIGN KEY (way) REFERENCES ways (id);\n",
        "ALTER TABLE way_nds ADD CONSTRAINT fk_way_nds_node FOREIGN KEY (to_node) REFERENCES nodes_legacy (id);\n",
        "ALTER TABLE relation_tags ADD CONSTRAINT pk_relation_tags PRIMARY KEY (relation, k);\n",
        "ALTER TABLE relation_tags ADD CONSTRAINT fk_relation_tags FOREIGN KEY (relation) REFERENCES relations (id);\n",
        "ALTER TABLE relation_members_node ADD CONSTRAINT fk_relation_members_node FOREIGN KEY (relation) REFERENCES relations (id);\n",
        "ALTER TABLE relation_members_node ADD CONSTRAINT fk_relation_members_tonode FOREIGN KEY (to_node) REFERENCES nodes_legacy (id);\n",
        "ALTER TABLE relation_members_way ADD CONSTRAINT fk_relation_members_way FOREIGN KEY (relation) REFERENCES relations (id);\n",
        "ALTER TABLE relation_members_way ADD CONSTRAINT fk_relation_members_toway FOREIGN KEY (to_way) REFERENCES ways (id);\n",
        "ALTER TABLE relation_members_relation ADD CONSTRAINT fk_relation_members_relation FOREIGN KEY (relation) REFERENCES relations (id);\n",
        "ALTER TABLE relation_members_relation ADD CONSTRAINT fk_relation_members_torelation FOREIGN KEY (to_relation) REFERENCES relations (id);\n",
        "COMMIT;\n",
    };

    /* The COPY statements need absolute paths to the CSV files. */
    char *current = get_current_dir_name();
    const char *dir = (current != NULL) ? current : ".";

    osm_print_lines(create_sql, sizeof create_sql / sizeof create_sql[0]);

    osm_print_copy(count_nodes, "nodes_legacy", dir, file_nodes);
#ifdef BENCHMARK
    osm_print_copy(count_nodes, "nodes_legacy_uint", dir, file_nodes_uint);
    osm_print_copy(count_nodes, "nodes_legacy_gis", dir, file_nodes_gis);
#endif
    osm_print_copy(count_node_tags, "node_tags", dir, file_node_tags);
    osm_print_copy(count_ways, "ways", dir, file_ways);
    osm_print_copy(count_way_tags, "way_tags", dir, file_way_tags);
    osm_print_copy(count_way_nds, "way_nds", dir, file_way_nds);
    osm_print_copy(count_relations, "relations", dir, file_relations);
    osm_print_copy(count_relation_tags, "relation_tags", dir,
                   file_relation_tags);
    osm_print_copy(count_members_node, "relation_members_node", dir,
                   file_relation_member_node);
    osm_print_copy(count_members_relation, "relation_members_relation", dir,
                   file_relation_member_relation);
    osm_print_copy(count_members_way, "relation_members_way", dir,
                   file_relation_member_way);

    osm_print_lines(constraint_sql,
                    sizeof constraint_sql / sizeof constraint_sql[0]);

    free(current);
}
/*
 * Program entry point: runs the stdin parser once and reports success.
 * Command-line arguments are accepted but not used.
 */
int main(int argc, char *argv[]) {
    (void) argc;
    (void) argv;

    parser();
    return EXIT_SUCCESS;
}
_______________________________________________
dev mailing list
dev@openstreetmap.org
http://lists.openstreetmap.org/listinfo/dev