Hi all Here is my old IMDB conversion tool (C code), and the schema I used. It uses only Actors and Movies, filtering out TV series and DVDs.
1484084 actors \_ actors 866799 actresses / 574006 movies 5900319 actors_movies actors.list.gz and actresses.list.gz files from ftp://ftp.fu-berlin.de/pub/misc/movies/database/ (via http://www.imdb.com/interfaces) My laptop is still building the actors_movies_actors table so I don't yet have a total for that, but from memory I think it was around 45 million edges. Currently I load using MyISAM with delay key writes and large key buffer size. We can possibly optimise the load methodology a bit further. Cheers, Arjen. -- Arjen Lentz, Exec.Director @ Open Query (http://openquery.com) Australian peace of mind for your MySQL/MariaDB infrastructure. Follow us at http://openquery.com/blog/ & http://twitter.com/openquery
/* ================================================================ Parse IMDB .list files (actors, actresses) to SQL) For having some more data to play with in OQGRAPH engine Copyright (C) 2009 by Arjen Lentz (ar...@openquery.com) GPLv2 or BSD license, you choose ;-) ================================================================ */ #include <stdio.h> #include <string.h> #include <ctype.h> #define MAX_LINE 1024 /* very rough trimming of leading and trailing spaces return new ptr to start of \0 string */ char *strip (char *s) { char *p; while (isspace(*s)) *s++; p = s + strlen(s); while (p > s && isspace(*--p)); *++p = '\0'; return (s); }/*strip()*/ char *title_suffix[] = { " (uncredited)", " (credit", " (as ", " (archive ", " (TV)", " (V)", " (VG)", " [", " <", NULL }; /* very rough string escape for SQL (just escapes single quote) */ char *escape_string (char *s) { static char line[MAX_LINE * 2]; char *p = line; while (*s) { if (*s == '\'') *p++ = '\\'; *p++ = *s++; } *p++ = '\0'; return (line); }/*escape_string*/ int main (int argc, char *argv[]) { char line[MAX_LINE+1]; char actor[MAX_LINE+1], longtitle[MAX_LINE+1], title[MAX_LINE+1]; char *p, *q; int i; // skip over file header while (fgets(line,1024,stdin) && strncmp(line,"Name",4)); fgets(line,1024,stdin); // just dashes actor[0] = '\0'; while (fgets(line,1024,stdin) && strncmp(line,"----",4)) { if ((p = strchr(line,'\t')) == NULL) { // skip weird lines actor[0] = '\0'; // could be end of an actor's movie list continue; } p = strip(++p); strcpy(longtitle,p); for (i = 0; title_suffix[i]; i++) { if ((q = strstr(p,title_suffix[i])) != NULL) *q = '\0'; } strcpy(title,strip(p)); if (!isspace(line[0])) { // line with actor *p = '\0'; // \0 after actor if ((p = strstr(line," (")) != NULL) *p = '\0'; // no (III) etc if ((p = strchr(line,',')) != NULL) { // last, first *p++ = '\0'; // set p to start of first name strcpy(actor,strip(p)); // first strcat(actor," "); // space strcat(actor,strip(line)); // last while ((q = strchr(actor,'\t')) != NULL) *q = ' '; } else { strcpy(actor,strip(line)); } printf("INSERT INTO actors (name) VALUES ('%s') ON DUPLICATE KEY UPDATE actor_id = LAST_INSERT_ID(actor_id);\n", escape_string(actor)); printf("SET @actor_id = LAST_INSERT_ID();\n"); } else if (!actor[0]) continue; // skip TV series/episodes and TV movies, Video only for now, and Video Games if (strstr(longtitle," {") || strstr(longtitle," (TV)") || strstr(longtitle," (V)") || strstr(longtitle," (VG)")) continue; printf("INSERT INTO movies (title) VALUES ('%s') ON DUPLICATE KEY UPDATE movie_id = LAST_INSERT_ID(movie_id);\n", escape_string(title)); printf("SET @movie_id = LAST_INSERT_ID();\n"); printf("INSERT IGNORE INTO actors_movies (actor_id,movie_id) VALUES (@actor_id,@movie_id);\n"); } return (0); }/*main()*/ /* end of imdb2csv.c */
-- We use MyISAM here because it's just a repeatable bulk load -- ACID commits would just delay stuff... -- enabling delay key write, key_buffer_size=3G CREATE TABLE actors ( actor_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, name VARCHAR(100) NOT NULL UNIQUE ) ENGINE=MyISAM DEFAULT CHARSET=latin1 DELAY_KEY_WRITE=1; CREATE TABLE movies ( movie_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, title VARCHAR(300) NOT NULL UNIQUE ) ENGINE=MyISAM DEFAULT CHARSET=latin1 DELAY_KEY_WRITE=1; CREATE TABLE actors_movies ( actor_movie_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, actor_id INT UNSIGNED NOT NULL, movie_id INT UNSIGNED NOT NULL, UNIQUE (actor_id,movie_id), INDEX (movie_id) ) ENGINE=MyISAM DEFAULT CHARSET=latin1 DELAY_KEY_WRITE=1; CREATE TABLE `actors_movies_actors` ( `actor_movie_id` INT(10) UNSIGNED NOT NULL, `actor_id` INT(10) UNSIGNED NOT NULL, PRIMARY KEY (`actor_movie_id`,`actor_id`) ) ENGINE=MyISAM DEFAULT CHARSET=latin1 DELAY_KEY_WRITE=1; -- populate link table -- insert into actors_movies_actors select am1.actor_movie_id,am2.actor_id from actors_movies am1 join actors_movies am2 on (am1.movie_id=am2.movie_id AND am1.actor_id != am2.actor_id); -- populate graph table -- insert into actors_graph (origid,destid) select am.actor_id,ama.actor_id from actors_movies_actors ama join actors_movies am using (actor_movie_id);
-- Mailing list: https://launchpad.net/~oqgraph-dev Post to : oqgraph-dev@lists.launchpad.net Unsubscribe : https://launchpad.net/~oqgraph-dev More help : https://help.launchpad.net/ListHelp