On Mon, 2007-06-18 at 12:49 -0700, Jason Kivlighn wrote:
> Hi,
> 
> imagemagick: Uses 'convert filename xmp:-' to output an image's embedded
> XMP.  This works for at least JPEG and TIFF files.  For JPEGs, however,
> ImageMagick outputs the namespace and XMP, separated by \0.  I'm not
> sure how I can handle this, without simply assuming that 'convert'
> returned two null-terminated strings.  Nevertheless, this extracts the
> XMP from TIFF files.
> 
> msoffice: Extends the msoffice extractor to also parse the
> DocumentSummaryInformation stream, which contains user-defined metadata,
> along with license metadata embedded by the MSOffice Creative Commons Add-in.
> 
> pdf: Extends the pdf extractor to read a PDF's metadata stream and parse
> it as XMP.  I'm still awaiting poppler extending the glib bindings to
> allow reading the metadata stream.  Until then, it will simply never
> find the metadata stream and go on without error.
> 
> png: Adds a check for the XML:com.adobe.xmp iTXt field, and parses it as
> XMP.
> 
> html: Adds a new html parser using libxml2.  Parses the document,
> checking for RDFa licenses.  It also checks for other basic HTML
> properties like title and author.
> 
> There are also several XML formats I'd like to parse for license data,
> particularly SVG and SMIL.  Would this be do-able, and if so, how should
> I go about it?  Write new extractors for each format or is this too much
> overhead?  These could use GMarkupParser, rather than bringing in libxml2
> like the HTML parser.
> 
> Cheers,
> Jason

Nathan, what do you think about these as well?

Jon

> plain text document attachment (tracker-imagemagick-extract-xmp.patch)
> Index: src/tracker-extract/tracker-extract-imagemagick.c
> ===================================================================
> --- src/tracker-extract/tracker-extract-imagemagick.c (revision 598)
> +++ src/tracker-extract/tracker-extract-imagemagick.c (working copy)
> @@ -35,7 +35,7 @@
>       gint           exit_status;
>  
>       /* imagemagick crashes trying to extract from xcf files */
> -     if (g_str_has_suffix (filename, '.xcf')) {
> +     if (g_str_has_suffix (filename, ".xcf")) {
>               return;
>       }
>  
> @@ -60,5 +60,16 @@
>                       g_hash_table_insert (metadata, g_strdup 
> ("Image:Comments"), g_strdup (g_strescape (lines[2], "")));
>               }
>       }
> +
> +     gchar         *xmp;
> +     argv[0] = g_strdup ("convert");
> +     argv[1] = g_strdup (filename);
> +     argv[2] = g_strdup ("xmp:-");
> +     argv[3] = NULL;
> +
> +     if (tracker_spawn (argv, 10, &xmp, &exit_status)) {
> +             if (exit_status == EXIT_SUCCESS) {
> +                     tracker_read_xmp(xmp,strlen(xmp),metadata);
> +             }
> +     }
>  }
> -
> plain text document attachment
> (tracker-msoffice-extract-license.patch)
> Index: src/tracker-extract/tracker-extract-msoffice.c
> ===================================================================
> --- src/tracker-extract/tracker-extract-msoffice.c    (revision 598)
> +++ src/tracker-extract/tracker-extract-msoffice.c    (working copy)
> @@ -118,7 +118,26 @@
>       }
>  }
>  
> +static void
> +doc_metadata_cb (gpointer key, gpointer value, gpointer user_data)
> +{
> +     gchar           *name;
> +     GsfDocProp      *property;
> +     GHashTable      *metadata;
> +     GValue const    *val;
>  
> +     name = (gchar *) key;
> +     property = (GsfDocProp *) value;
> +     metadata = (GHashTable *) user_data;
> +
> +     val = gsf_doc_prop_get_val (property);
> +
> +     if (strcmp (name, "CreativeCommons_LicenseURL") == 0) {
> +             add_gvalue_in_hash_table (metadata, "File:License", val);
> +     }
> +}
> +
> +
>  void
>  tracker_extract_msoffice (gchar *filename, GHashTable *metadata)
>  {
> @@ -145,25 +164,37 @@
>       }
>  
>       stream = gsf_infile_child_by_name (infile, "\05SummaryInformation");
> -     g_object_unref (G_OBJECT (infile));
> -
> -     if (!stream) {
> -             gsf_shutdown ();
> -             return;
> +     if (stream) {
> +             md = gsf_doc_meta_data_new ();
> +     
> +             if (gsf_msole_metadata_read (stream, md)) {
> +                     gsf_shutdown ();
> +                     return;
> +             }
> +     
> +             gsf_doc_meta_data_foreach (md, metadata_cb, metadata);
> +     
> +             g_object_unref (G_OBJECT (md));
> +             g_object_unref (G_OBJECT (stream));
>       }
>  
> -     md = gsf_doc_meta_data_new ();
> -
> -     if (gsf_msole_metadata_read (stream, md)) {
> -             gsf_shutdown ();
> -             return;
> +     stream = gsf_infile_child_by_name (infile, 
> "\05DocumentSummaryInformation");
> +     if (stream) {
> +             md = gsf_doc_meta_data_new ();
> +     
> +             if (gsf_msole_metadata_read (stream, md)) {
> +                     gsf_shutdown ();
> +                     return;
> +             }
> +     
> +             gsf_doc_meta_data_foreach (md, doc_metadata_cb, metadata);
> +     
> +             g_object_unref (G_OBJECT (md));
> +             g_object_unref (G_OBJECT (stream));
>       }
>  
> -     gsf_doc_meta_data_foreach (md, metadata_cb, metadata);
> +     g_object_unref (G_OBJECT (infile));
>  
> -     g_object_unref (G_OBJECT (md));
> -     g_object_unref (G_OBJECT (stream));
> -
>       gsf_shutdown ();
>  }
>  
> plain text document attachment (tracker-pdf-extract-xmp.patch)
> Index: src/tracker-extract/tracker-extract-pdf.c
> ===================================================================
> --- src/tracker-extract/tracker-extract-pdf.c (revision 598)
> +++ src/tracker-extract/tracker-extract-pdf.c (working copy)
> @@ -26,6 +26,8 @@
>  #include <string.h>
>  #include <glib.h>
>  
> +#include "tracker-extract.h"
> +
>  void tracker_extract_pdf (gchar *filename, GHashTable *metadata)
>  {
>       PopplerDocument *document;
> @@ -34,6 +36,7 @@
>       gchar           *author;
>       gchar           *subject;
>       gchar           *keywords;
> +     gchar           *metadata_xml;
>       GTime            creation_date;
>       GError          *error = NULL;
>  
> @@ -50,6 +53,7 @@
>               "subject", &subject,
>               "keywords", &keywords,
>               "creation-date", &creation_date,
> +             "metadata", &metadata_xml,
>               NULL);
>  
>       if (title && strlen (title))
> @@ -71,10 +75,15 @@
>       g_hash_table_insert (metadata, g_strdup ("Doc:PageCount"),
>               g_strdup_printf ("%d", poppler_document_get_n_pages 
> (document)));
>  
> +     if ( metadata_xml ) {
> +             tracker_read_xmp (metadata_xml,strlen(metadata_xml),metadata);
> +     }
> +
>       g_free (title);
>       g_free (author);
>       g_free (subject);
>       g_free (keywords);
> +     g_free (metadata_xml);
>       g_object_unref (document);
>  }
>  
> plain text document attachment (tracker-png-extract-xmp.patch)
> Index: src/tracker-extract/tracker-extract-png.c
> ===================================================================
> --- src/tracker-extract/tracker-extract-png.c (revision 598)
> +++ src/tracker-extract/tracker-extract-png.c (working copy)
> @@ -20,6 +20,8 @@
>  
>  #include "config.h"
>  
> +#include "tracker-extract.h"
> +
>  #include <stdio.h>
>  #include <glib.h>
>  #include <png.h>
> @@ -79,15 +81,23 @@
>                                            g_strdup_printf ("%ld", height));
>               }
>  
> -
>               if (png_get_text (png_ptr, info_ptr, &text_ptr, &num_text) > 0) 
> {
> -                     for (i = 0; i < num_text; i++) {
> -                             for (j=0; tagmap[j].type; j++) {
> -                                     if (strcasecmp (tagmap[j].name,  
> text_ptr[i].key) == 0) {
> -                                             if (text_ptr[i].text && strlen 
> (text_ptr[i].text) > 0) {
> -                                                     g_hash_table_insert 
> (metadata, g_strdup (tagmap[j].type), g_strdup (text_ptr[i].text));
> +                     for (i = 0; i < num_text; i++) {
> +                             if ( text_ptr[i].key != NULL ) {
> +                                     #if defined(HAVE_EXEMPI) && 
> defined(PNG_iTXt_SUPPORTED)
> +                                     if 
> (strcmp("XML:com.adobe.xmp",text_ptr[i].key) == 0) {
> +                                             
> tracker_read_xmp(text_ptr[i].text,text_ptr[i].itxt_length,metadata);
> +                                             continue;
> +                                     }
> +                                     #endif
> +     
> +                                     for (j=0; tagmap[j].type; j++) {
> +                                             if (strcasecmp (tagmap[j].name, 
>  text_ptr[i].key) == 0) {
> +                                                     if (text_ptr[i].text && 
> strlen (text_ptr[i].text) > 0) {
> +                                                             
> g_hash_table_insert (metadata, g_strdup (tagmap[j].type), g_strdup 
> (text_ptr[i].text));
> +                                                     }
> +                                                     break;
>                                               }
> -                                             break;
>                                       }
>                               }
>                       }
> plain text document attachment (tracker-extract-html.patch)
> Index: src/tracker-extract/tracker-extract.c
> ===================================================================
> --- src/tracker-extract/tracker-extract.c     (revision 598)
> +++ src/tracker-extract/tracker-extract.c     (working copy)
> @@ -52,6 +52,9 @@
>  void tracker_extract_totem   (gchar *, GHashTable *);
>  void tracker_extract_oasis   (gchar *, GHashTable *);
>  void tracker_extract_ps              (gchar *, GHashTable *);
> +#ifdef HAVE_LIBXML2
> +void tracker_extract_html            (gchar *, GHashTable *);
> +#endif
>  #ifdef HAVE_POPPLER
>  void tracker_extract_pdf     (gchar *, GHashTable *);
>  #endif
> @@ -84,6 +87,10 @@
>       /* Document extractors */
>       { "application/vnd.oasis.opendocument.*",       tracker_extract_oasis   
>         },
>       { "application/postscript",                     tracker_extract_ps      
>         },
> +#ifdef HAVE_LIBXML2
> +     { "text/html",                                                  
> tracker_extract_html    },
> +     { "application/xhtml+xml",      tracker_extract_html    },
> +#endif
>  #ifdef HAVE_POPPLER
>       { "application/pdf",                            tracker_extract_pdf     
>         },
>  #endif
> Index: src/tracker-extract/Makefile.am
> ===================================================================
> --- src/tracker-extract/Makefile.am   (revision 598)
> +++ src/tracker-extract/Makefile.am   (working copy)
> @@ -5,6 +5,7 @@
>       $(LIBGSF_CFLAGS)                        \
>       $(LIBGSF_CFLAGS)                        \
>       $(GSTREAMER_CFLAGS)                     \
> +     $(LIBXML2_CFLAGS)                       \
>       $(XINE_CFLAGS)
>  
>  bin_PROGRAMS = tracker-extract
> @@ -33,6 +34,7 @@
>       tracker-extract-imagemagick.c           \
>       tracker-extract-mplayer.c               \
>       tracker-extract-totem.c                 \
> +     tracker-extract-html.c                  \
>       $(video_sources)
>  
>  tracker_extract_LDADD = $(GLIB2_LIBS)                \
> @@ -41,4 +43,5 @@
>       $(LIBEXIF_LIBS)                         \
>       $(LIBGSF_LIBS)                          \
>       $(GSTREAMER_LIBS)                       \
> +     $(LIBXML2_LIBS)                         \
>       $(XINE_LIBS)
> Index: configure.ac
> ===================================================================
> --- configure.ac      (revision 598)
> +++ configure.ac      (working copy)
> @@ -605,8 +605,25 @@
>     [ AC_DEFINE(IOPRIO_SUPPORT,[],[Define ioprio support]) ioprio_support=yes 
> ])
>  AC_MSG_RESULT([$ioprio_support])
>  
> -#####################################################
> +##################################################################
> +# check for libxml2
> +##################################################################
>  
> +LIBXML2_REQUIRED=0.6
> +
> +AC_ARG_ENABLE(libxml2, AC_HELP_STRING([--disable-libxml2],[Disable HTML/XML 
> extractors (full-text will still be available)]),,[enable_libxml2=yes])
> +if test "x$enable_libxml2" = "xyes"; then
> +     PKG_CHECK_MODULES(LIBXML2, [
> +             libxml-2.0 >= $LIBXML2_REQUIRED],
> +             [have_libxml2=yes] , [have_libxml2=no])
> +     AC_SUBST(LIBXML2_CFLAGS)
> +     AC_SUBST(LIBXML2_LIBS)
> +else
> +     have_libxml2="no (disabled)"
> +fi
> +AM_CONDITIONAL(HAVE_LIBXML2, test "$have_libxml2" = "yes")
> +test "$have_libxml2" = "yes" && AC_DEFINE(HAVE_LIBXML2, [], [Define if we 
> have libxml2])
> +
>  AC_CONFIG_FILES([
>       Makefile
>       tracker.pc
> @@ -675,6 +692,7 @@
>       exif (jpeg):                            $have_libexif
>       gsf:                                    $have_libgsf
>       video files:                            $videos_are_handled 
> ($videos_handler)
> +     xml/html formats:                       $have_libxml2
>  
>  "
>  if test "x$enable_external_sqlite" = "xyes"; then
> Index: src/tracker-extract/tracker-extract-html.c
> ===================================================================
> --- src/tracker-extract/tracker-extract-html.c        (revision 0)
> +++ src/tracker-extract/tracker-extract-html.c        (revision 0)
> @@ -0,0 +1,161 @@
> +/* Tracker Extract - extracts embedded metadata from files
> + * Copyright (C) 2007, Jason Kivlighn ([EMAIL PROTECTED])
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public
> + * License along with this program; if not, write to the
> + * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
> + * Boston, MA  02110-1301, USA.
> + */
> +
> +#include "config.h"
> +
> +#ifdef HAVE_LIBXML2
> +
> +#include <string.h>
> +#include <glib.h>
> +#include <libxml/HTMLparser.h>
> +
> +typedef enum {
> +             READ_TITLE,
> +     } tag_type;
> +
> +typedef struct {
> +     GHashTable *metadata;
> +     tag_type current;
> +} HTMLParseInfo;
> +
> +gboolean
> +has_attribute( const xmlChar ** atts, const char *attr, const char*val )
> +{
> +     int i;
> +     for ( i = 0; atts[i]; i+=2 )
> +     {
> +             if ( strcmp((char*)atts[i],attr) == 0 ) {
> +                     if ( !val || strcmp((char*)atts[i+1],val) == 0 ) {
> +                             return TRUE;
> +                     }
> +             }
> +     }
> +     return FALSE;
> +}
> +
> +const xmlChar *
> +lookup_attribute( const xmlChar **atts, const char *attr )
> +{
> +     int i;
> +     for ( i = 0; atts[i]; i+=2 )
> +     {
> +             if ( strcmp((char*)atts[i],attr) == 0 ) {
> +                     return atts[i+1];
> +             }
> +     }
> +
> +     return NULL;
> +}
> +
> +void
> +startElement (void * info, const xmlChar * name, const xmlChar ** atts)
> +{
> +     /* Look for RDFa triple describing the license */
> +     if ( strcmp((char*)name,"a") == 0 ) {
> +             /* This tag is a license.  Ignore, however, if it is referring 
> to another document */
> +             if ( has_attribute(atts,"rel","license") && 
> !has_attribute(atts,"about",NULL) ) {
> +                     const xmlChar *href = lookup_attribute(atts,"href");
> +                     if ( href ) {
> +                             g_hash_table_insert (((HTMLParseInfo 
> *)info)->metadata, g_strdup ("File:License"),
> +                                                  g_strdup( (char*)href ));
> +                     }
> +             }
> +     } else if ( strcmp((char*)name,"title") == 0 ) {
> +             ((HTMLParseInfo *)info)->current = READ_TITLE;
> +     } else if ( strcmp((char*)name,"meta") == 0 ) {
> +             if ( has_attribute(atts,"name","Author") ) {
> +                     const xmlChar *author = 
> lookup_attribute(atts,"content");
> +                     if ( author ) {
> +                             g_hash_table_insert (((HTMLParseInfo 
> *)info)->metadata, g_strdup ("Doc:Author"),
> +                                                  g_strdup( (char*)author ));
> +                     }
> +             }
> +             if ( has_attribute(atts,"name","DC.Description") ) {
> +                     const xmlChar *desc = lookup_attribute(atts,"content");
> +                     if ( desc ) {
> +                             g_hash_table_insert (((HTMLParseInfo 
> *)info)->metadata, g_strdup ("Doc:Comments"),
> +                                                  g_strdup( (char*)desc ));
> +                     }
> +             }
> +     }
> +}
> +
> +void
> +characters(void * info, const xmlChar * ch, int len)
> +{
> +     switch(((HTMLParseInfo *)info)->current) {
> +             case READ_TITLE:
> +                             g_hash_table_insert (((HTMLParseInfo 
> *)info)->metadata, g_strdup ("Doc:Title"),
> +                                                  g_strdup( (char*)ch ));
> +                             break;
> +             default: break;
> +     }
> +
> +     ((HTMLParseInfo *)info)->current = -1;
> +}
> +
> +void tracker_extract_html (gchar* filename, GHashTable *metadata)
> +{
> +     xmlSAXHandler SAXHandlerStruct = {
> +                     NULL, /* internalSubset */
> +                     NULL, /* isStandalone */
> +                     NULL, /* hasInternalSubset */
> +                     NULL, /* hasExternalSubset */
> +                     NULL, /* resolveEntity */
> +                     NULL, /* getEntity */
> +                     NULL, /* entityDecl */
> +                     NULL, /* notationDecl */
> +                     NULL, /* attributeDecl */
> +                     NULL, /* elementDecl */
> +                     NULL, /* unparsedEntityDecl */
> +                     NULL, /* setDocumentLocator */
> +                     NULL, /* startDocument */
> +                     NULL, /* endDocument */
> +                     startElement, /* startElement */
> +                     NULL, /* endElement */
> +                     NULL, /* reference */
> +                     characters, /* characters */
> +                     NULL, /* ignorableWhitespace */
> +                     NULL, /* processingInstruction */
> +                     NULL, /* comment */
> +                     NULL, /* xmlParserWarning */
> +                     NULL, /* xmlParserError */
> +                     NULL, /* xmlParserError */
> +                     NULL, /* getParameterEntity */
> +                     NULL, /* cdataBlock */
> +                     NULL, /* externalSubset */
> +                     1,    /* initialized */
> +                     NULL, /* private */
> +                     NULL, /* startElementNsSAX2Func */
> +                     NULL, /* endElementNsSAX2Func */
> +                     NULL  /* xmlStructuredErrorFunc */
> +     };
> +
> +     HTMLParseInfo   info = { metadata, -1 };
> +
> +     htmlDocPtr doc;
> +     doc = htmlSAXParseFile(filename, NULL, &SAXHandlerStruct, &info);
> +     if ( doc ) {
> +             xmlFreeDoc(doc);
> +     }
> +}
> +
> +#else
> +#warning "Not building HTML metadata extractor."
> +#endif
> _______________________________________________
> tracker-list mailing list
> [EMAIL PROTECTED]
> http://mail.gnome.org/mailman/listinfo/tracker-list
-- 
Jon Phillips

San Francisco, CA
USA PH 510.499.0894
[EMAIL PROTECTED]
http://www.rejon.org

MSN, AIM, Yahoo Chat: kidproto
Jabber Chat: [EMAIL PROTECTED]
IRC: [EMAIL PROTECTED]

_______________________________________________
cc-devel mailing list
[email protected]
http://lists.ibiblio.org/mailman/listinfo/cc-devel

Reply via email to