hi all-

I came up with this first pass at metadata-extraction for manpages.
Currently it adds the following keys:

Doc:Title - the manpage name, e.g. ls, cat

Doc:Subject - based on the formal descriptions of each section from the
man(1) page, e.g. "Executable Programs", "Library Calls".  This part
needs i18n love.

Man:Section - this is a custom key referring to the manpage section number
(1-9, plus weird ones like 3ssl and 3pm).  Is it legal/kosher to add my
own key like that?

Doc:Description - taken from the 'NAME' section of the manpage,
specifically the text after the '-'.

Running tracker-extract manually over various manpages seems to work.
However, trackerd doesn't seem to be picking up the files.  Is there
something I need to do in trackerd for it to index files of type
text/troff?
-- 
                                      .~.
Michael Frank                         /v\
[EMAIL PROTECTED]             // \\
                                    /(   )\
GPG Fingerprint:                     ^`-'^
2A44 DF32 91A5 ADA9 0E86 4F65 4051 870D 8B51 6EE0
Index: tracker-extract.c
===================================================================
--- tracker-extract.c	(revision 542)
+++ tracker-extract.c	(working copy)
@@ -52,6 +52,7 @@
 void tracker_extract_totem	(gchar *, GHashTable *);
 void tracker_extract_oasis	(gchar *, GHashTable *);
 void tracker_extract_ps		(gchar *, GHashTable *);
+void tracker_extract_manpage	(gchar *, GHashTable *);
 #ifdef HAVE_POPPLER
 void tracker_extract_pdf	(gchar *, GHashTable *);
 #endif
@@ -84,6 +85,7 @@
 	/* Document extractors */
  	{ "application/vnd.oasis.opendocument.*",	tracker_extract_oasis		},
  	{ "application/postscript",			tracker_extract_ps		},
+    { "text/troff",                     tracker_extract_manpage },
 #ifdef HAVE_POPPLER
  	{ "application/pdf",				tracker_extract_pdf		},
 #endif
Index: tracker-extract-manpage.c
===================================================================
--- tracker-extract-manpage.c	(revision 0)
+++ tracker-extract-manpage.c	(revision 0)
@@ -0,0 +1,129 @@
+/* Tracker Extract - extracts embedded metadata from manpages
+ * Copyright (C) 2007, Michael Frank ([EMAIL PROTECTED])
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+
+#include "config.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <glib.h>
+#include <glib/gstdio.h>
+
+/* Subjects for manual sections 1-9, indexed by section number minus one.
+ * TODO: these strings need to be translated */
+static const gchar *const manpage_subjects[] = {
+    "Executable Programs And Shell Commands",
+    "System Calls",
+    "Library Calls",
+    "Special Files",
+    "File Formats And Conventions",
+    "Games",
+    "Miscellaneous",
+    "System Administration Commands",
+    "Kernel Routines"
+};
+
+/* Strips the double quotes around token in place and returns the unquoted
+ * text (a pointer into token); a missing closing quote is tolerated. */
+static gchar *manpage_unquote (gchar *token)
+{
+    gchar *end_quote = NULL;
+
+    if (token[0] == '"')
+        end_quote = strchr (++token, '"');
+    if (end_quote)
+        *end_quote = '\0';
+    return token;
+}
+
+/* Parses the arguments of a ".TH title section ..." line, modifying args in
+ * place; title and section may be quoted.  Adds Doc:Title, Man:Section and,
+ * when the section's leading number is 1-9 (e.g. "3", "3ssl"), Doc:Subject. */
+static void manpage_parse_title (gchar *args, GHashTable *metadata)
+{
+    static const gchar delims[] = " \t\v\r\f\n";
+    gchar *token, *end_ptr = NULL;
+    guint64 section_num;
+
+    token = strtok (args, delims);
+    if (token == NULL)
+        return;
+    g_hash_table_insert (metadata, g_strdup ("Doc:Title"), g_strdup (manpage_unquote (token)));
+    token = strtok (NULL, delims);
+    if (token == NULL)
+        return;
+    token = manpage_unquote (token);
+    g_hash_table_insert (metadata, g_strdup ("Man:Section"), g_strdup (token));
+    section_num = g_ascii_strtoull (token, &end_ptr, 10);
+    if (end_ptr > token && section_num >= 1 && section_num <= G_N_ELEMENTS (manpage_subjects))
+        g_hash_table_insert (metadata, g_strdup ("Doc:Subject"), g_strdup (manpage_subjects[section_num - 1]));
+    /* TODO: the date argument that follows is fuzzy and non-standard; skipped */
+}
+
+/* Returns TRUE if line is a ".SH NAME" section header; NAME may be quoted. */
+static gboolean manpage_is_name_header (const gchar *line)
+{
+    if (strncmp (line, ".SH", 3) != 0 || !g_ascii_isspace (line[3]))
+        return FALSE;
+    line += 3 + strspn (line + 3, " \t\"");
+    return strncmp (line, "NAME", 4) == 0;
+}
+
+/* Extracts metadata from the manpage at filename into metadata; keys and
+ * values are newly allocated strings.  The .TH line yields Doc:Title,
+ * Man:Section and Doc:Subject.  The NAME section yields Doc:Description:
+ * its text after "\-" (all of it if "\-" is absent), whitespace-trimmed.
+ * A file that cannot be opened is skipped without reporting an error. */
+void tracker_extract_manpage (gchar *filename, GHashTable *metadata)
+{
+    FILE *fp;
+    gchar buffer[256];
+    gboolean parsed_title = FALSE, parsed_name_section = FALSE;
+
+    fp = g_fopen (filename, "r");
+    if (fp == NULL)
+        return;
+
+    /* stop reading once both the .TH line and the NAME section were seen */
+    while (!(parsed_title && parsed_name_section) && fgets (buffer, sizeof buffer, fp)) {
+        if (!parsed_title && !strncmp (buffer, ".TH", 3) && g_ascii_isspace (buffer[3])) {
+            manpage_parse_title (buffer + 3, metadata);
+            parsed_title = TRUE;
+        } else if (!parsed_name_section && manpage_is_name_header (buffer)) {
+            GString *name = g_string_new (NULL);
+            gchar *desc;
+
+            while (fgets (buffer, sizeof buffer, fp)) {
+                if (!strncmp (buffer, ".SH", 3))
+                    break;
+                g_string_append (name, buffer);
+            }
+            desc = strstr (name->str, "\\-");
+            desc = desc ? desc + 2 : name->str;
+            /* strip the blank after "\-" and the newline kept by fgets */
+            g_hash_table_insert (metadata, g_strdup ("Doc:Description"), g_strstrip (g_strdup (desc)));
+            g_string_free (name, TRUE);
+            parsed_name_section = TRUE;
+        }
+    }
+
+    if (ferror (fp))
+        g_debug ("error parsing manpage '%s'", filename);
+    fclose (fp);
+}
Index: Makefile.am
===================================================================
--- Makefile.am	(revision 542)
+++ Makefile.am	(working copy)
@@ -33,6 +33,7 @@
 	tracker-extract-imagemagick.c		\
 	tracker-extract-mplayer.c		\
 	tracker-extract-totem.c			\
+	tracker-extract-manpage.c		\
 	$(video_sources)
 
 tracker_extract_LDADD = $(GLIB2_LIBS)		\
_______________________________________________
tracker-list mailing list
[email protected]
http://mail.gnome.org/mailman/listinfo/tracker-list

Reply via email to