Guess I ought to send the patch.

On 9/24/06, Edward Duffy <[EMAIL PROTECTED]> wrote:
Here's a patch for src/trackerd/tracker-metadata.c that can use
internal metadata extractors, depending on mimetype.  If no internal
extractor is provided, it falls back to the old method using
tracker-extract.  I've included extractors for Oasis (Open Office
files), postscript, and AbiWord.  I'm looking at the PDF parser in
libextractor's repos, and will hopefully have a patch for that soon.

So far, this is just extraction, no metadata replacement.

The interface is a little different than the one proposed by Jamie the
other day:

void tracker_metadata_extract_oasis (gchar *, GHashTable *);

The metadata hashtable is a parameter instead of creating a new one
and returning it.

Index: src/trackerd/Makefile.am
===================================================================
RCS file: /cvs/gnome/tracker/src/trackerd/Makefile.am,v
retrieving revision 1.10
diff -u -p -r1.10 Makefile.am
--- src/trackerd/Makefile.am	20 Sep 2006 23:40:51 -0000	1.10
+++ src/trackerd/Makefile.am	25 Sep 2006 01:24:27 -0000
@@ -96,6 +96,9 @@ trackerd_SOURCES =		\
 	tracker-mbox.h		\
 	tracker-metadata.c	\
 	tracker-metadata.h	\
+	tracker-metadata-oasis.c	\
+	tracker-metadata-ps.c	\
+	tracker-metadata-abw.c	\
 	tracker-rdf-query.c	\
 	tracker-rdf-query.h	\
 	tracker-stemmer-english.c	\
Index: src/trackerd/tracker-metadata.c
===================================================================
RCS file: /cvs/gnome/tracker/src/trackerd/tracker-metadata.c,v
retrieving revision 1.9
diff -u -p -r1.9 tracker-metadata.c
--- src/trackerd/tracker-metadata.c	9 Sep 2006 23:54:08 -0000	1.9
+++ src/trackerd/tracker-metadata.c	25 Sep 2006 01:24:54 -0000
@@ -127,6 +127,25 @@ char *development_mime_types[] = {
 				"text/x-tcl"
 };
 
+typedef void (*MetadataExtractFunc)(gchar *, GHashTable *);  /* (file, table to fill) */
+typedef struct {
+	const char           *mime;       /* mime type, matched exactly with strcmp() */
+	MetadataExtractFunc  extractor;   /* NULL marks the end of the table */
+} MimeToExtractor;
+/* Built-in extractors; each one is defined in its own tracker-metadata-*.c file. */
+void tracker_metadata_extract_oasis (gchar *, GHashTable *);
+void tracker_metadata_extract_ps    (gchar *, GHashTable *);
+void tracker_metadata_extract_abw   (gchar *, GHashTable *);
+/* Mime type -> built-in extractor map; lookups stop at the NULL extractor. */
+MimeToExtractor internal_metadata_extractors[] = {
+	{ "application/vnd.oasis.opendocument.text",         tracker_metadata_extract_oasis },
+	{ "application/vnd.oasis.opendocument.spreadsheet",  tracker_metadata_extract_oasis },
+	{ "application/vnd.oasis.opendocument.graphics",     tracker_metadata_extract_oasis },
+	{ "application/vnd.oasis.opendocument.presentation", tracker_metadata_extract_oasis },
+	{ "application/postscript",                          tracker_metadata_extract_ps    },
+	{ "application/x-abiword",                           tracker_metadata_extract_abw   },
+	{ "",                                                NULL                           }
+};
 
 static MetadataFileType
 tracker_get_metadata_type (const char *mime)
@@ -399,15 +418,32 @@ tracker_metadata_get_thumbnail (const ch
 	return NULL;
 }
 
+static void log_metadata_cb (gpointer key, gpointer value, gpointer user_data)
+{ /* g_hash_table_foreach() callback: logs one entry as "key = value", treating both as strings; user_data is unused. */
+	tracker_log ("%s = %s", (gchar *)key, (gchar *)value);
+}
 
 void
 tracker_metadata_get_embedded (const char *uri, const char *mime, GHashTable *table)
 {
+	MimeToExtractor  *p;
 	MetadataFileType meta_type;
+	gboolean         found;
 
 	if (!uri || !mime || !table) {
 		return;
 	}
+
+	found = FALSE;
+	for (p = internal_metadata_extractors; p->extractor; ++p) {
+		if (strcmp (p->mime, mime) == 0) {
+			found = TRUE;
+			(*p->extractor)(uri, table);
+			g_hash_table_foreach (table, log_metadata_cb, NULL);
+		}
+	}
+	if (found)
+		return;
 
 	meta_type = tracker_get_metadata_type (mime);
 
--- /dev/null	2006-08-05 19:53:54.000000000 -0400
+++ src/trackerd/tracker-metadata-ps.c	2006-09-24 20:46:27.000000000 -0400
@@ -0,0 +1,58 @@
+
+#include <stdio.h>
+#include <string.h>
+#include <glib.h>
+
+/* Return the text after a DSC keyword of length `taglen`, skipping leading blanks. */
+static const gchar *ps_tag_value (const gchar *line, gsize taglen)
+{
+	const gchar *value = line + taglen;
+	while (*value == ' ' || *value == '\t')
+		value++;
+	return value;
+}
+
+/* Copy the DSC header comments of the PostScript file `filename` into `metadata`
+ * as g_strdup()'d key/value pairs; does nothing if the file cannot be opened. */
+void tracker_metadata_extract_ps (gchar *filename, GHashTable *metadata)
+{
+	static const struct { const gchar *tag, *key; } header_tags[] = {
+		{ "%%Copyright:", "File.Other" }, { "%%Title:",        "Doc.Title"   },
+		{ "%%Creator:",   "Doc.Author" }, { "%%CreationDate:", "Doc.Created" }
+	};
+	FILE     *f;
+	gchar    *line = NULL;
+	gsize     length = 0;
+	gssize    n;
+	guint     i;
+	gboolean  pageno_atend = FALSE;
+	gboolean  header_finished = FALSE;
+
+	if (!(f = fopen (filename, "r")))
+		return;  /* nothing to read, and nothing to fclose () */
+	/* Test getline()'s result, not feof(): covers read errors and a last line without '\n'. */
+	while ((n = getline (&line, &length, f)) > 0) {
+		while (n > 0 && (line[n - 1] == '\n' || line[n - 1] == '\r'))
+			line[--n] = '\0';
+		if (strcmp (line, "%%EndComments") == 0) {
+			header_finished = TRUE;
+			if (!pageno_atend)
+				break;  /* page count is not deferred to the trailer */
+		}
+		else if (g_str_has_prefix (line, "%%Pages:")) {
+			const gchar *pages = ps_tag_value (line, 8);
+			if (strcmp (pages, "(atend)") == 0)
+				pageno_atend = TRUE;
+			else
+				g_hash_table_insert (metadata,
+					g_strdup ("Doc.PageCount"), g_strdup (pages));
+		}
+		for (i = 0; !header_finished && i < G_N_ELEMENTS (header_tags); i++) {
+			if (g_str_has_prefix (line, header_tags[i].tag))
+				g_hash_table_insert (metadata, g_strdup (header_tags[i].key),
+					g_strdup (ps_tag_value (line, strlen (header_tags[i].tag))));
+		}
+	}
+	g_free (line);  /* NOTE(review): getline() memory is malloc()'d; assumes g_free() == free() */
+	fclose (f);
+}
--- /dev/null	2006-08-05 19:53:54.000000000 -0400
+++ src/trackerd/tracker-metadata-oasis.c	2006-09-24 21:15:08.000000000 -0400
@@ -0,0 +1,181 @@
+
+#include <string.h>
+#include <glib.h>
+
+typedef enum {  /* which meta.xml element the parser is currently inside */
+		READ_TITLE,       /* <dc:title>           -> "Doc.Title" */
+		READ_SUBJECT,     /* <dc:subject>         -> "Doc.Subject" */
+		READ_AUTHOR,      /* <dc:creator>         -> "Doc.Author" */
+		READ_KEYWORDS,    /* <meta:keyword>       -> "Doc.Keywords" (comma-joined) */
+		READ_COMMENTS,    /* <dc:description>     -> "Doc.Comments" */
+		READ_STATS,       /* <meta:document-statistic>; data comes from attributes, its text is ignored */
+		READ_CREATED,     /* <meta:creation-date> -> "Doc.Created" */
+		READ_FILE_OTHER   /* <meta:generator>     -> "File.Other" */
+	} tag_type;
+
+typedef struct {  /* state handed to the GMarkup callbacks as user_data */
+	GHashTable *metadata;  /* table the extracted key/value pairs are inserted into */
+	tag_type current;      /* element being read; the handlers store -1 here for "none" */
+} ODTParseInfo;
+
+static void start_element_handler (GMarkupParseContext *context,
+                                   const gchar *element_name,
+                                   const gchar **attribute_names,
+                                   const gchar **attribute_values,
+                                   gpointer user_data,
+                                   GError **error);
+
+static void end_element_handler (GMarkupParseContext *context,
+                                 const gchar *element_name,
+                                 gpointer user_data,
+                                 GError **error);
+
+static void text_handler (GMarkupParseContext *context,
+                          const gchar *text,
+                          gsize text_len,
+                          gpointer user_data,
+                          GError **error);
+
+/*
+ * Fill `metadata` from the meta.xml member of the OpenDocument file `filename`.
+ *
+ * "unzip -p <filename> meta.xml" is run synchronously and its standard output
+ * is handed to a GMarkup parser; the parser callbacks in this file do the
+ * actual inserts.  If the command cannot be spawned the table is left alone.
+ */
+void tracker_metadata_extract_oasis (gchar *filename, GHashTable *metadata)
+{
+	gchar                *unzip_cmd[5];
+	gchar                *meta_xml;
+	ODTParseInfo          state = { metadata, -1 };
+	GMarkupParseContext  *ctx;
+	GMarkupParser         callbacks = {
+		start_element_handler,
+		end_element_handler,
+		text_handler,
+		NULL,
+		NULL
+	};
+
+	unzip_cmd[0] = "unzip";
+	unzip_cmd[1] = "-p";
+	unzip_cmd[2] = filename;
+	unzip_cmd[3] = "meta.xml";
+	unzip_cmd[4] = NULL;
+
+	if (!g_spawn_sync (NULL,
+	                   unzip_cmd,
+	                   NULL,
+	                   G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL,
+	                   NULL,
+	                   NULL,
+	                   &meta_xml,      /* captured standard output */
+	                   NULL,
+	                   NULL,           /* exit status is not inspected */
+	                   NULL)) {
+		return;
+	}
+	/* Parse failures are not reported; anything the callbacks stored before
+	 * the failure stays in the table. */
+	ctx = g_markup_parse_context_new (&callbacks, 0, &state, NULL);
+	g_markup_parse_context_parse (ctx, meta_xml, -1, NULL);
+	g_markup_parse_context_free (ctx);
+	g_free (meta_xml);
+}
+
+/*
+ * GMarkupParser start-element callback: notes in the ODTParseInfo behind
+ * user_data which field the opened element carries (-1 for none).  The counts
+ * of <meta:document-statistic> are attributes and are stored immediately.
+ */
+void start_element_handler (GMarkupParseContext *context,
+                            const gchar *element_name,
+                            const gchar **attribute_names,
+                            const gchar **attribute_values,
+                            gpointer user_data,
+                            GError **error)
+{
+	static const struct { const gchar *element; tag_type field; } text_fields[] = {
+		{ "dc:title",           READ_TITLE      },
+		{ "dc:subject",         READ_SUBJECT    },
+		{ "dc:creator",         READ_AUTHOR     },
+		{ "meta:keyword",       READ_KEYWORDS   },
+		{ "dc:description",     READ_COMMENTS   },
+		{ "meta:creation-date", READ_CREATED    },
+		{ "meta:generator",     READ_FILE_OTHER }
+	};
+	ODTParseInfo *state = user_data;
+	guint         i;
+
+	if (strcmp (element_name, "meta:document-statistic") == 0) {
+		for (i = 0; attribute_names[i]; i++) {
+			if (strcmp (attribute_names[i], "meta:word-count") == 0) {
+				g_hash_table_insert (state->metadata,
+					g_strdup ("Doc.WordCount"), g_strdup (attribute_values[i]));
+			}
+			else if (strcmp (attribute_names[i], "meta:page-count") == 0) {
+				g_hash_table_insert (state->metadata,
+					g_strdup ("Doc.PageCount"), g_strdup (attribute_values[i]));
+			}
+		}
+		state->current = READ_STATS;
+		return;
+	}
+
+	state->current = -1;  /* unless the element is one of the known text fields */
+	for (i = 0; i < G_N_ELEMENTS (text_fields); i++) {
+		if (strcmp (element_name, text_fields[i].element) == 0) {
+			state->current = text_fields[i].field;
+			break;
+		}
+	}
+}
+
+/* GMarkupParser end-element callback: closing any element clears the pending field. */
+void end_element_handler (GMarkupParseContext *context, const gchar *element_name,
+                          gpointer user_data, GError **error)
+{
+	ODTParseInfo *state = user_data;
+	state->current = -1;
+}
+
+/*
+ * GMarkupParser text callback: stores the character data of the element noted
+ * by start_element_handler() under the matching metadata key.  Successive
+ * <meta:keyword> values are joined with "," under "Doc.Keywords".
+ */
+void text_handler (GMarkupParseContext *context,
+                   const gchar *text,
+                   gsize text_len,
+                   gpointer user_data,
+                   GError **error)
+{
+	ODTParseInfo *state = user_data;
+	const gchar  *key;
+	gchar        *value;
+	gchar        *previous;
+
+	switch (state->current) {
+		case READ_TITLE:      key = "Doc.Title";    break;
+		case READ_SUBJECT:    key = "Doc.Subject";  break;
+		case READ_AUTHOR:     key = "Doc.Author";   break;
+		case READ_KEYWORDS:   key = "Doc.Keywords"; break;
+		case READ_COMMENTS:   key = "Doc.Comments"; break;
+		case READ_CREATED:    key = "Doc.Created";  break;
+		case READ_FILE_OTHER: key = "File.Other";   break;
+		default:              return;  /* READ_STATS, or not inside a known element */
+	}
+
+	/* GMarkup documents `text` as not nul-terminated: copy exactly text_len bytes. */
+	value = g_strndup (text, text_len);
+	if (state->current == READ_KEYWORDS
+			&& (previous = g_hash_table_lookup (state->metadata, key)) != NULL) {
+		gchar *joined = g_strconcat (previous, ",", value, NULL);
+		g_free (value);
+		/* The key is duplicated like every other key put in this table. */
+		g_hash_table_replace (state->metadata, g_strdup (key), joined);
+	}
+	else {
+		g_hash_table_insert (state->metadata, g_strdup (key), value);
+	}
+}
--- /dev/null	2006-08-05 19:53:54.000000000 -0400
+++ src/trackerd/tracker-metadata-abw.c	2006-09-24 20:44:08.000000000 -0400
@@ -0,0 +1,47 @@
+
+#include <string.h>
+#include <stdio.h>
+#include <glib.h>
+
+/*
+ * Copy the <m key="..."> metadata lines of the AbiWord file `filename` into
+ * `metadata` as g_strdup()'d key/value pairs.  The file is scanned line by
+ * line, not parsed as XML; does nothing if it cannot be opened.
+ */
+void tracker_metadata_extract_abw (gchar *filename, GHashTable *metadata)
+{
+	static const struct { const gchar *prefix, *key; } fields[] = {
+		{ "<m key=\"dc.title\">",         "Doc.Title"    },
+		{ "<m key=\"dc.subject\">",       "Doc.Subject"  },
+		{ "<m key=\"dc.creator\">",       "Doc.Author"   },
+		{ "<m key=\"abiword.keywords\">", "Doc.Keywords" },
+		{ "<m key=\"dc.description\">",   "Doc.Comments" }
+	};
+	FILE   *f;
+	gchar  *line = NULL;
+	gsize   length = 0;
+	gssize  n;
+	guint   i;
+
+	if (!(f = fopen (filename, "r")))
+		return;  /* nothing to read, and nothing to fclose () */
+
+	/* Test getline()'s result, not feof(): covers read errors and a last line without '\n'. */
+	while ((n = getline (&line, &length, f)) > 0) {
+		while (n > 0 && (line[n - 1] == '\n' || line[n - 1] == '\r'))
+			line[--n] = '\0';
+		/* Cut the closing tag so that only the value is left. */
+		if (g_str_has_suffix (line, "</m>"))
+			line[strlen (line) - 4] = '\0';
+		for (i = 0; i < G_N_ELEMENTS (fields); i++) {
+			if (g_str_has_prefix (line, fields[i].prefix)) {
+				g_hash_table_insert (metadata, g_strdup (fields[i].key),
+					g_strdup (line + strlen (fields[i].prefix)));
+				break;
+			}
+		}
+	}
+	g_free (line);  /* NOTE(review): getline() memory is malloc()'d; assumes g_free() == free() */
+	fclose (f);
+}
+
_______________________________________________
tracker-list mailing list
[email protected]
http://mail.gnome.org/mailman/listinfo/tracker-list

Reply via email to