Guess I ought to send the patch.
On 9/24/06, Edward Duffy <[EMAIL PROTECTED]> wrote:
Here's a patch for src/trackerd/tracker-metadata.c that can use
internal metadata extractors, depending on mimetype. If no internal
extractor is provided, it falls back to the old method using
tracker-extract. I've included extractors for Oasis (Open Office
files), postscript, and AbiWord. I'm looking at the PDF parser in
libextractor's repos, and will hopefully have a patch for that soon.
So far, this is just extraction, no metadata replacement.
The interface is a little different than the one proposed by Jamie the
other day:
void tracker_metadata_extract_oasis (gchar *, GHashTable *);
The metadata hash table is passed in as a parameter, rather than each
extractor creating a new one and returning it.
Index: src/trackerd/Makefile.am
===================================================================
RCS file: /cvs/gnome/tracker/src/trackerd/Makefile.am,v
retrieving revision 1.10
diff -u -p -r1.10 Makefile.am
--- src/trackerd/Makefile.am 20 Sep 2006 23:40:51 -0000 1.10
+++ src/trackerd/Makefile.am 25 Sep 2006 01:24:27 -0000
@@ -96,6 +96,9 @@ trackerd_SOURCES = \
tracker-mbox.h \
tracker-metadata.c \
tracker-metadata.h \
+ tracker-metadata-oasis.c \
+ tracker-metadata-ps.c \
+ tracker-metadata-abw.c \
tracker-rdf-query.c \
tracker-rdf-query.h \
tracker-stemmer-english.c \
Index: src/trackerd/tracker-metadata.c
===================================================================
RCS file: /cvs/gnome/tracker/src/trackerd/tracker-metadata.c,v
retrieving revision 1.9
diff -u -p -r1.9 tracker-metadata.c
--- src/trackerd/tracker-metadata.c 9 Sep 2006 23:54:08 -0000 1.9
+++ src/trackerd/tracker-metadata.c 25 Sep 2006 01:24:54 -0000
@@ -127,6 +127,25 @@ char *development_mime_types[] = {
"text/x-tcl"
};
+/* Signature shared by every built-in extractor: it is handed a file name
+ * and a hash table to which it adds the metadata it finds. */
+typedef void (*MetadataExtractFunc) (gchar *filename, GHashTable *metadata);
+
+/* One row of the mime type -> built-in extractor dispatch table. */
+typedef struct {
+	char                *mime;
+	MetadataExtractFunc  extractor;
+} MimeToExtractor;
+
+/* Built-in extractors; each one is defined in its own tracker-metadata-*.c file. */
+void tracker_metadata_extract_oasis (gchar *filename, GHashTable *metadata);
+void tracker_metadata_extract_ps    (gchar *filename, GHashTable *metadata);
+void tracker_metadata_extract_abw   (gchar *filename, GHashTable *metadata);
+
+/* Dispatch table; the row whose extractor is NULL marks the end. */
+MimeToExtractor internal_metadata_extractors[] = {
+	{ "application/vnd.oasis.opendocument.text",         tracker_metadata_extract_oasis },
+	{ "application/vnd.oasis.opendocument.spreadsheet",  tracker_metadata_extract_oasis },
+	{ "application/vnd.oasis.opendocument.graphics",     tracker_metadata_extract_oasis },
+	{ "application/vnd.oasis.opendocument.presentation", tracker_metadata_extract_oasis },
+	{ "application/postscript",                          tracker_metadata_extract_ps },
+	{ "application/x-abiword",                           tracker_metadata_extract_abw },
+	{ "",                                                NULL }
+};
static MetadataFileType
tracker_get_metadata_type (const char *mime)
@@ -399,15 +418,32 @@ tracker_metadata_get_thumbnail (const ch
return NULL;
}
+/* Callback for g_hash_table_foreach(): logs one entry as "key = value".
+ * Both key and value are treated as strings; user_data is unused. */
+static void
+log_metadata_cb (gpointer key, gpointer value, gpointer user_data)
+{
+	gchar *name = key;
+	gchar *text = value;
+
+	tracker_log ("%s = %s", name, text);
+}
void
tracker_metadata_get_embedded (const char *uri, const char *mime, GHashTable *table)
{
+ MimeToExtractor *p;
MetadataFileType meta_type;
+ gboolean found;
if (!uri || !mime || !table) {
return;
}
+
+ found = FALSE;
+ for (p = internal_metadata_extractors; p->extractor; ++p) {
+ if (strcmp (p->mime, mime) == 0) {
+ found = TRUE;
+ (*p->extractor)(uri, table);
+ g_hash_table_foreach (table, log_metadata_cb, NULL);
+ }
+ }
+ if (found)
+ return;
meta_type = tracker_get_metadata_type (mime);
--- /dev/null 2006-08-05 19:53:54.000000000 -0400
+++ src/trackerd/tracker-metadata-ps.c 2006-09-24 20:46:27.000000000 -0400
@@ -0,0 +1,58 @@
+
+#include <stdio.h>
+#include <string.h>
+#include <glib.h>
+
+/* Return the text that follows a DSC comment keyword of length key_len,
+ * skipping the blanks that separate the keyword from its value.  The caller
+ * must already have checked that line starts with the keyword. */
+static const gchar *
+ps_comment_value (const gchar *line, gsize key_len)
+{
+	const gchar *value = line + key_len;
+
+	while (*value == ' ' || *value == '\t')
+		value++;
+	return value;
+}
+
+/*
+ * Extract metadata from the "%%" structuring comments of a PostScript file.
+ *
+ * filename: path of the file to read; nothing happens if it cannot be opened.
+ * metadata: table that receives newly allocated key/value strings
+ *           (File.Other, Doc.Title, Doc.Author, Doc.Created, Doc.PageCount).
+ *
+ * Copyright, Title, Creator and CreationDate are only honoured before
+ * "%%EndComments".  Scanning stops at "%%EndComments" unless the header said
+ * "%%Pages: (atend)", in which case the rest of the file is scanned for the
+ * real "%%Pages:" value.
+ */
+void tracker_metadata_extract_ps (gchar *filename, GHashTable *metadata)
+{
+	static const struct {
+		const gchar *keyword;
+		const gchar *field;
+	} header_fields[] = {
+		{ "%%Copyright:",    "File.Other"  },
+		{ "%%Title:",        "Doc.Title"   },
+		{ "%%Creator:",      "Doc.Author"  },
+		{ "%%CreationDate:", "Doc.Created" }
+	};
+	FILE *f;
+	gchar *line = NULL;
+	gsize length = 0;
+	gssize n_read;
+	gboolean pageno_atend = FALSE;
+	gboolean header_finished = FALSE;
+
+	if (!filename || !metadata)
+		return;
+
+	f = fopen (filename, "r");
+	if (!f)
+		return;
+
+	/* Drive the loop from getline()'s return value so that a read error
+	 * ends the scan and a final line without a newline is still seen.
+	 * The buffer is reused between iterations. */
+	while ((n_read = getline (&line, &length, f)) != -1) {
+		/* Strip the line terminator, if any (LF or CR LF). */
+		while (n_read > 0
+		       && (line[n_read - 1] == '\n' || line[n_read - 1] == '\r'))
+			line[--n_read] = '\0';
+
+		if (strncmp (line, "%%EndComments", 13) == 0) {
+			header_finished = TRUE;
+			if (!pageno_atend)
+				break;
+		}
+		else if (strncmp (line, "%%Pages:", 8) == 0) {
+			const gchar *pages = ps_comment_value (line, 8);
+
+			if (strcmp (pages, "(atend)") == 0)
+				pageno_atend = TRUE;
+			else
+				g_hash_table_insert (metadata,
+					g_strdup ("Doc.PageCount"), g_strdup (pages));
+		}
+		else if (!header_finished) {
+			guint i;
+
+			for (i = 0; i < G_N_ELEMENTS (header_fields); i++) {
+				gsize key_len = strlen (header_fields[i].keyword);
+
+				if (strncmp (line, header_fields[i].keyword, key_len) == 0) {
+					g_hash_table_insert (metadata,
+						g_strdup (header_fields[i].field),
+						g_strdup (ps_comment_value (line, key_len)));
+					break;
+				}
+			}
+		}
+	}
+
+	/* NOTE(review): getline() allocates with malloc(); g_free() is kept
+	 * from the original and is only correct while GLib uses the system
+	 * allocator.  Switch to free() once <stdlib.h> is included. */
+	g_free (line);
+	fclose (f);
+}
--- /dev/null 2006-08-05 19:53:54.000000000 -0400
+++ src/trackerd/tracker-metadata-oasis.c 2006-09-24 21:15:08.000000000 -0400
@@ -0,0 +1,181 @@
+
+#include <string.h>
+#include <glib.h>
+
+/* Which metadata element the parser is currently inside. */
+typedef enum {
+	READ_TITLE,      /* dc:title */
+	READ_SUBJECT,    /* dc:subject */
+	READ_AUTHOR,     /* dc:creator */
+	READ_KEYWORDS,   /* meta:keyword */
+	READ_COMMENTS,   /* dc:description */
+	READ_STATS,      /* meta:document-statistic */
+	READ_CREATED,    /* meta:creation-date */
+	READ_FILE_OTHER  /* meta:generator */
+} tag_type;
+
+/* State shared by the GMarkup callbacks while meta.xml is parsed. */
+typedef struct {
+	GHashTable *metadata;  /* table the extracted values are added to */
+	tag_type    current;   /* element being read; -1 when it is none of the above */
+} ODTParseInfo;
+
+/* GMarkup callbacks used to parse meta.xml; defined further down. */
+static void start_element_handler (GMarkupParseContext  *context,
+                                   const gchar          *element_name,
+                                   const gchar         **attribute_names,
+                                   const gchar         **attribute_values,
+                                   gpointer              user_data,
+                                   GError              **error);
+static void end_element_handler   (GMarkupParseContext  *context,
+                                   const gchar          *element_name,
+                                   gpointer              user_data,
+                                   GError              **error);
+static void text_handler          (GMarkupParseContext  *context,
+                                   const gchar          *text,
+                                   gsize                 text_len,
+                                   gpointer              user_data,
+                                   GError              **error);
+
+/*
+ * Extract metadata from an OpenDocument file: runs "unzip -p <filename>
+ * meta.xml", captures its standard output and feeds it to a GMarkup parser
+ * whose callbacks add the values they find to metadata.  Nothing is added
+ * when the child process cannot be spawned; parse errors are ignored.
+ */
+void tracker_metadata_extract_oasis (gchar *filename, GHashTable *metadata)
+{
+	GMarkupParser handlers = {
+		start_element_handler,
+		end_element_handler,
+		text_handler,
+		NULL,
+		NULL
+	};
+	GMarkupParseContext *parse_ctx;
+	ODTParseInfo state;
+	gchar *command[5];
+	gchar *meta_xml;
+	gboolean spawned;
+
+	state.metadata = metadata;
+	state.current = -1;
+
+	command[0] = (gchar *) "unzip";
+	command[1] = (gchar *) "-p";
+	command[2] = filename;
+	command[3] = (gchar *) "meta.xml";
+	command[4] = NULL;
+
+	spawned = g_spawn_sync (NULL, command, NULL,
+				G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL,
+				NULL, NULL,
+				&meta_xml, NULL,
+				NULL, NULL);
+	if (!spawned)
+		return;
+
+	parse_ctx = g_markup_parse_context_new (&handlers, 0, &state, NULL);
+	g_markup_parse_context_parse (parse_ctx, meta_xml, -1, NULL);
+	g_markup_parse_context_free (parse_ctx);
+
+	g_free (meta_xml);
+}
+
+/*
+ * GMarkup start-element callback.  Records in the parse state which metadata
+ * element has just been opened (-1 for any other element).  For
+ * meta:document-statistic the word and page counts are attributes, so they
+ * are added to the metadata table right here.
+ */
+void start_element_handler (GMarkupParseContext *context,
+			    const gchar *element_name,
+			    const gchar **attribute_names,
+			    const gchar **attribute_values,
+			    gpointer user_data,
+			    GError **error)
+{
+	static const struct {
+		const gchar *element;
+		tag_type     tag;
+	} known_elements[] = {
+		{ "dc:title",                READ_TITLE      },
+		{ "dc:subject",              READ_SUBJECT    },
+		{ "dc:creator",              READ_AUTHOR     },
+		{ "meta:keyword",            READ_KEYWORDS   },
+		{ "dc:description",          READ_COMMENTS   },
+		{ "meta:document-statistic", READ_STATS      },
+		{ "meta:creation-date",      READ_CREATED    },
+		{ "meta:generator",          READ_FILE_OTHER }
+	};
+	ODTParseInfo *state = user_data;
+	guint i;
+
+	state->current = -1;
+	for (i = 0; i < G_N_ELEMENTS (known_elements); i++) {
+		if (strcmp (element_name, known_elements[i].element) == 0) {
+			state->current = known_elements[i].tag;
+			break;
+		}
+	}
+
+	if (state->current != READ_STATS)
+		return;
+
+	/* attribute_names and attribute_values are parallel, NULL-terminated arrays. */
+	for (i = 0; attribute_names[i]; i++) {
+		if (strcmp (attribute_names[i], "meta:word-count") == 0) {
+			g_hash_table_insert (state->metadata,
+				g_strdup ("Doc.WordCount"), g_strdup (attribute_values[i]));
+		}
+		else if (strcmp (attribute_names[i], "meta:page-count") == 0) {
+			g_hash_table_insert (state->metadata,
+				g_strdup ("Doc.PageCount"), g_strdup (attribute_values[i]));
+		}
+	}
+}
+
+/* GMarkup end-element callback: whatever element just closed, the parser is
+ * no longer inside a metadata element, so reset the state to -1. */
+void end_element_handler (GMarkupParseContext *context,
+			  const gchar *element_name,
+			  gpointer user_data,
+			  GError **error)
+{
+	ODTParseInfo *state = user_data;
+
+	state->current = -1;
+}
+
+/*
+ * GMarkup text callback.  Stores the text of the element currently being read
+ * (as recorded in the parse state) under the matching metadata key.  Text
+ * seen outside a recognised element is ignored.
+ *
+ * text:      element text; NOT NUL-terminated, so only text_len bytes are used.
+ * user_data: the ODTParseInfo for this parse.
+ *
+ * Successive meta:keyword values are joined with "," under Doc.Keywords.
+ */
+void text_handler (GMarkupParseContext *context,
+		   const gchar *text,
+		   gsize text_len,
+		   gpointer user_data,
+		   GError **error)
+{
+	ODTParseInfo *state = user_data;
+	const gchar *key;
+	gchar *value;
+
+	switch ((int) state->current) {
+	case READ_TITLE:      key = "Doc.Title";    break;
+	case READ_SUBJECT:    key = "Doc.Subject";  break;
+	case READ_AUTHOR:     key = "Doc.Author";   break;
+	case READ_KEYWORDS:   key = "Doc.Keywords"; break;
+	case READ_COMMENTS:   key = "Doc.Comments"; break;
+	case READ_CREATED:    key = "Doc.Created";  break;
+	case READ_FILE_OTHER: key = "File.Other";   break;
+	default:
+		/* READ_STATS carries its data in attributes; -1 is "no element". */
+		return;
+	}
+
+	/* Copy exactly text_len bytes: GMarkup does not terminate the buffer. */
+	value = g_strndup (text, text_len);
+
+	if (state->current == READ_KEYWORDS) {
+		const gchar *previous = g_hash_table_lookup (state->metadata, key);
+
+		if (previous) {
+			/* Build the joined string before the insert below lets
+			 * the table release the previous value. */
+			gchar *joined = g_strconcat (previous, ",", value, NULL);
+
+			g_free (value);
+			value = joined;
+		}
+	}
+
+	/* Always hand the table a heap-allocated key, like every other
+	 * insertion into it; a string literal must never end up owned by it. */
+	g_hash_table_insert (state->metadata, g_strdup (key), value);
+}
--- /dev/null 2006-08-05 19:53:54.000000000 -0400
+++ src/trackerd/tracker-metadata-abw.c 2006-09-24 20:44:08.000000000 -0400
@@ -0,0 +1,47 @@
+
+#include <string.h>
+#include <stdio.h>
+#include <glib.h>
+
+/*
+ * Extract metadata from an AbiWord document by scanning it line by line for
+ * <m key="...">value</m> entries that start at the beginning of a line.
+ *
+ * filename: path of the file to read; nothing happens if it cannot be opened.
+ * metadata: table that receives newly allocated key/value strings
+ *           (Doc.Title, Doc.Subject, Doc.Author, Doc.Keywords, Doc.Comments).
+ *
+ * The line terminator and a trailing "</m>" are removed from the value.
+ */
+void tracker_metadata_extract_abw (gchar *filename, GHashTable *metadata)
+{
+	static const struct {
+		const gchar *prefix;
+		const gchar *field;
+	} fields[] = {
+		{ "<m key=\"dc.title\">",         "Doc.Title"    },
+		{ "<m key=\"dc.subject\">",       "Doc.Subject"  },
+		{ "<m key=\"dc.creator\">",       "Doc.Author"   },
+		{ "<m key=\"abiword.keywords\">", "Doc.Keywords" },
+		{ "<m key=\"dc.description\">",   "Doc.Comments" }
+	};
+	FILE *f;
+	gchar *line = NULL;
+	gsize length = 0;
+	gssize n_read;
+
+	if (!filename || !metadata)
+		return;
+
+	f = fopen (filename, "r");
+	if (!f)
+		return;
+
+	/* Drive the loop from getline()'s return value so that a read error
+	 * ends the scan and a final line without a newline is still seen.
+	 * The buffer is reused between iterations. */
+	while ((n_read = getline (&line, &length, f)) != -1) {
+		guint i;
+
+		/* Strip the line terminator, if any (LF or CR LF). */
+		while (n_read > 0
+		       && (line[n_read - 1] == '\n' || line[n_read - 1] == '\r'))
+			line[--n_read] = '\0';
+
+		/* Drop the closing tag so only the value remains after the prefix. */
+		if (n_read >= 4 && strcmp (line + n_read - 4, "</m>") == 0) {
+			n_read -= 4;
+			line[n_read] = '\0';
+		}
+
+		for (i = 0; i < G_N_ELEMENTS (fields); i++) {
+			if (g_str_has_prefix (line, fields[i].prefix)) {
+				g_hash_table_insert (metadata,
+					g_strdup (fields[i].field),
+					g_strdup (line + strlen (fields[i].prefix)));
+				break;
+			}
+		}
+	}
+
+	/* NOTE(review): getline() allocates with malloc(); g_free() is kept
+	 * from the original and is only correct while GLib uses the system
+	 * allocator.  Switch to free() once <stdlib.h> is included. */
+	g_free (line);
+	fclose (f);
+}
+
_______________________________________________
tracker-list mailing list
[email protected]
http://mail.gnome.org/mailman/listinfo/tracker-list