Author: david
Date: Thu Jul  5 15:17:37 2012
New Revision: 11860

Log:
Save PDF text as a property of the digital object. Refs issue #1252

Modified:
   trunk/lib/model/QubitDigitalObject.php

Modified: trunk/lib/model/QubitDigitalObject.php
==============================================================================
--- trunk/lib/model/QubitDigitalObject.php      Thu Jul  5 14:06:41 2012        
(r11859)
+++ trunk/lib/model/QubitDigitalObject.php      Thu Jul  5 15:17:37 2012        
(r11860)
@@ -962,6 +962,12 @@
             $this->createReferenceImage($connection);
             $this->createThumbnail($connection);
           }
+
+          // Extract text if possible
+          if (self::canExtractText($this->mimeType))
+          {
+            $this->extractText($connection);
+          }
         }
 
         break;
@@ -1522,6 +1528,22 @@
   }
 
   /**
+   * Test if the given mime-type allows text extraction ('application/pdf' only)
+   *
+   * @param string $mimeType mime-type to test
+   * @return boolean true if extraction is supported
+   */
+  public static function canExtractText($mimeType)
+  {
+    if ('application/pdf' == $mimeType)
+    {
+      return true;
+    }
+
+    return false;
+  }
+
+  /**
    * Return true if derived mimeType is "image/*"
    *
    * @param string $filename
@@ -1696,6 +1718,40 @@
 
       return $derivative;
     }
+    $originalFullPath = $this->getAbsolutePath();
+    list($originalNameNoExtension) = explode('.', $this->getName());
+
+    switch ($usageId)
+    {
+      case QubitTerm::REFERENCE_ID:
+        $derivativeName = $originalNameNoExtension.'_'.$usageId.'.flv';
+        $derivativeFullPath = 
sfConfig::get('sf_web_dir').$this->getPath().$derivativeName;
+        self::convertVideoToFlash($originalFullPath, $derivativeFullPath);
+        break;
+      case QubitTerm::THUMBNAIL_ID:
+      default:
+        $extension = '.'.self::THUMB_EXTENSION;
+        $derivativeName = $originalNameNoExtension.'_'.$usageId.$extension;
+        $derivativeFullPath = 
sfConfig::get('sf_web_dir').$this->getPath().$derivativeName;
+        $maxDimensions = self::getImageMaxDimensions($usageId);
+        self::convertVideoToThumbnail($originalFullPath, $derivativeFullPath, 
$maxDimensions[0], $maxDimensions[1]);
+    }
+
+    if (file_exists($derivativeFullPath) && 0 < ($byteSize = 
filesize($derivativeFullPath)))
+    {
+      $derivative = new QubitDigitalObject;
+      $derivative->setPath($this->getPath());
+      $derivative->setName($derivativeName);
+      $derivative->parentId = $this->id;
+      $derivative->setByteSize($byteSize);
+      $derivative->usageId = $usageId;
+      $derivative->setMimeAndMediaType();
+      $derivative->createDerivatives = false;
+      $derivative->indexOnSave = false;
+      $derivative->save($connection);
+
+      return $derivative;
+    }
   }
 
   /**
@@ -1830,6 +1886,53 @@
     return file_get_contents($tmpFilePath);
   }
 
+
+  /*
+   * -----------------------------------------------------------------------
+   * TEXT METHODS
+   * -----------------------------------------------------------------------
+   */
+
+  /**
+   * Test if the pdftotext command is available on this system
+   *
+   * Runs `which pdftotext` and requires a zero exit status with at least one
+   * line of output.
+   *
+   * @return boolean true if pdftotext was found
+   */
+  public static function hasPdfToText()
+  {
+    exec('which pdftotext', $output, $status);
+
+    return 0 == $status && 0 < count($output);
+  }
+
+  /**
+   * Extract the text of the current digital object with pdftotext and save it
+   * as a 'text' property of this object
+   *
+   * @param mixed $connection passed through to QubitProperty::save()
+   * @return string|null extracted text; null if pdftotext is not available,
+   *   exits with a non-zero status, or produces no output
+   */
+  public function extractText($connection = null)
+  {
+    // Test for pdftotext binary
+    if (!self::hasPdfToText())
+    {
+      return;
+    }
+
+    // Escape the path so that spaces or shell metacharacters in the file name
+    // cannot break or alter the command
+    $command = sprintf('pdftotext %s -', escapeshellarg($this->getAbsolutePath()));
+    exec($command, $output, $status);
+
+    if (0 == $status && 0 < count($output))
+    {
+      // exec() returns output as an array of lines; rejoin into one string
+      $text = implode(PHP_EOL, $output);
+
+      $property = new QubitProperty;
+      $property->objectId = $this->id;
+      $property->name = 'text';
+      $property->scope = 'Text extracted from orginal file via pdftotext';
+      $property->value = $text;
+      $property->indexOnSave = false;
+
+      $property->save($connection);
+
+      return $text;
+    }
+  }
+
   /* -----------------------------------------------------------------------
    * CHECKSUMS
    * --------------------------------------------------------------------- */

-- 
You received this message because you are subscribed to the Google Groups 
"Qubit Toolkit Commits" group.
To post to this group, send email to [email protected].
To unsubscribe from this group, send email to 
[email protected].
For more options, visit this group at 
http://groups.google.com/group/qubit-commits?hl=en.

Reply via email to