hi all-
I came up with this first pass at metadata-extraction for manpages.
Currently it adds the following keys:
Doc:Title - the manpage name, e.g. ls, cat
Doc:Subject - based on the formal descriptions of each section from the
man(1) page, e.g. "Executable Programs", "Library Calls". These strings
are currently hard-coded in English and still need i18n support.
Man:Section - this is a custom key referring to the manpage section number
(1-9, plus weird ones like 3ssl and 3pm). Is it acceptable to add my
own key like that?
Doc:Description - taken from the 'NAME' section of the manpage,
specifically the text after the '-'.
Running tracker-extract manually over various manpages seems to work.
However, trackerd doesn't seem to be picking up the files. Is there
something I need to do in trackerd for it to index files of type
text/troff?
--
.~.
Michael Frank /v\
[EMAIL PROTECTED] // \\
/( )\
GPG Fingerprint: ^`-'^
2A44 DF32 91A5 ADA9 0E86 4F65 4051 870D 8B51 6EE0
Index: tracker-extract.c
===================================================================
--- tracker-extract.c (revision 542)
+++ tracker-extract.c (working copy)
@@ -52,6 +52,7 @@
void tracker_extract_totem (gchar *, GHashTable *);
void tracker_extract_oasis (gchar *, GHashTable *);
void tracker_extract_ps (gchar *, GHashTable *);
+void tracker_extract_manpage (gchar *, GHashTable *);
#ifdef HAVE_POPPLER
void tracker_extract_pdf (gchar *, GHashTable *);
#endif
@@ -84,6 +85,7 @@
/* Document extractors */
{ "application/vnd.oasis.opendocument.*", tracker_extract_oasis },
{ "application/postscript", tracker_extract_ps },
+ { "text/troff", tracker_extract_manpage },
#ifdef HAVE_POPPLER
{ "application/pdf", tracker_extract_pdf },
#endif
Index: tracker-extract-manpage.c
===================================================================
--- tracker-extract-manpage.c (revision 0)
+++ tracker-extract-manpage.c (revision 0)
@@ -0,0 +1,129 @@
+/* Tracker Extract - extracts embedded metadata from manpages
+ * Copyright (C) 2007, Michael Frank ([EMAIL PROTECTED])
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+
+#include "config.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <glib.h>
+#include <glib/gstdio.h>
+
+void tracker_extract_manpage (gchar *filename, GHashTable *metadata)
+{
+ FILE *fp;
+ gchar buffer[256];
+ gboolean parsed_title = FALSE, parsed_name_section = FALSE;
+ /* Reads filename and inserts Doc:Title, Man:Section, Doc:Subject and Doc:Description (g_strdup'ed keys and values) into metadata; returns without inserting anything if the file cannot be opened. */
+ fp = g_fopen (filename, "r");
+ if (fp == NULL)
+ return;
+ /* Each fgets call stores at most 255 characters, so a longer input line arrives as several chunks. */
+ while (fgets (buffer, 256, fp)) {
+ if (!parsed_title && !strncmp (buffer, ".TH", 3)) {
+ gchar *token = strtok (buffer + 3, " \t\v\r\f\n");
+ /* strtok splits the rest of the .TH line in place on whitespace: the first token becomes the title, the second the section. */
+ if (token != NULL)
+ g_hash_table_insert (metadata, g_strdup ("Doc:Title"), g_strdup (token));
+ token = strtok (NULL, " \t\v\r\f\n");
+ if (token != NULL) {
+ guint64 section_num = 0;
+ gchar *end_ptr = NULL;
+ /* Man:Section stores the (unquoted) token as text; Doc:Subject is chosen from the number parsed at its start. */
+ /* some man pages quote the section number */
+ if (token[0] == '\"') {
+ gchar *end_quote = strchr (++token, '\"');
+ if (end_quote)
+ *end_quote = '\0';
+ }
+ g_hash_table_insert (metadata, g_strdup ("Man:Section"), g_strdup (token));
+ section_num = g_ascii_strtoull (token, &end_ptr, 10);
+ if (end_ptr > token) { /* true only if g_ascii_strtoull consumed part of token */
+ /* TODO: these strings need to be translated */
+ switch (section_num) {
+ case 1:
+ g_hash_table_insert (metadata, g_strdup ("Doc:Subject"), g_strdup ("Executable Programs And Shell Commands"));
+ break;
+ case 2:
+ g_hash_table_insert (metadata, g_strdup ("Doc:Subject"), g_strdup ("System Calls"));
+ break;
+ case 3:
+ g_hash_table_insert (metadata, g_strdup ("Doc:Subject"), g_strdup ("Library Calls"));
+ break;
+ case 4:
+ g_hash_table_insert (metadata, g_strdup ("Doc:Subject"), g_strdup ("Special Files"));
+ break;
+ case 5:
+ g_hash_table_insert (metadata, g_strdup ("Doc:Subject"), g_strdup ("File Formats And Conventions"));
+ break;
+ case 6:
+ g_hash_table_insert (metadata, g_strdup ("Doc:Subject"), g_strdup ("Games"));
+ break;
+ case 7:
+ g_hash_table_insert (metadata, g_strdup ("Doc:Subject"), g_strdup ("Miscellaneous"));
+ break;
+ case 8:
+ g_hash_table_insert (metadata, g_strdup ("Doc:Subject"), g_strdup ("System Administration Commands"));
+ break;
+ case 9:
+ g_hash_table_insert (metadata, g_strdup ("Doc:Subject"), g_strdup ("Kernel Routines"));
+ break;
+ default: /* any other section number gets no Doc:Subject */
+ break;
+ }
+ }
+ }
+ /* TODO: is this worth it? the date is pretty fuzzy and kinda non-standard
+ token = strtok (NULL, " \t\v\r\f\n");
+ if (token != NULL) {
+ GDate cdate;
+ struct tm ctm;
+ g_date_set_parse (&cdate, token);
+ if (g_date_valid (&cdate)) {
+ g_date_to_struct_tm (&cdate, &ctm);
+ g_hash_table_insert (metadata, g_strdup ("Doc:Created"), asctime (&ctm));
+ }
+ }
+ */
+ parsed_title = TRUE; /* only the first .TH line is ever examined */
+ }
+ /* Collect every chunk between ".SH NAME" and the next line starting with ".SH" (that line is consumed here too); the description is the text after the first "\-", or the whole collected text if there is none. */
+ if (!parsed_name_section && !strncmp (buffer, ".SH NAME", 8)) {
+ GString *name = g_string_new (NULL);
+ gchar *desc;
+ while (fgets (buffer, 256, fp)) {
+ if (!strncmp (buffer, ".SH", 3))
+ break;
+ name = g_string_append (name, buffer);
+ }
+ desc = strstr (name->str, "\\-");
+ if (desc)
+ desc += 2;
+ else
+ desc = name->str;
+ g_hash_table_insert (metadata, g_strdup ("Doc:Description"), g_strdup (desc));
+ g_string_free (name, TRUE);
+ parsed_name_section = TRUE;
+ }
+ }
+ /* fgets returning NULL ends the loop; ferror tells a read error from end of file, and the error is only logged. */
+ if (ferror (fp))
+ g_debug ("error parsing manpage '%s'", filename);
+ fclose (fp);
+}
Index: Makefile.am
===================================================================
--- Makefile.am (revision 542)
+++ Makefile.am (working copy)
@@ -33,6 +33,7 @@
tracker-extract-imagemagick.c \
tracker-extract-mplayer.c \
tracker-extract-totem.c \
+ tracker-extract-manpage.c \
$(video_sources)
tracker_extract_LDADD = $(GLIB2_LIBS) \
_______________________________________________
tracker-list mailing list
[email protected]
http://mail.gnome.org/mailman/listinfo/tracker-list