Author: david
Date: Thu Jul 5 15:17:37 2012
New Revision: 11860
Log:
Save PDF text as a property of the digital object. Refs issue #1252
Modified:
trunk/lib/model/QubitDigitalObject.php
Modified: trunk/lib/model/QubitDigitalObject.php
==============================================================================
--- trunk/lib/model/QubitDigitalObject.php Thu Jul 5 14:06:41 2012
(r11859)
+++ trunk/lib/model/QubitDigitalObject.php Thu Jul 5 15:17:37 2012
(r11860)
@@ -962,6 +962,12 @@
$this->createReferenceImage($connection);
$this->createThumbnail($connection);
}
+
+ // Extract text if possible
+ if (self::canExtractText($this->mimeType))
+ {
+ $this->extractText($connection);
+ }
}
break;
@@ -1522,6 +1528,22 @@
}
/**
+ * Test if the given mime-type allows text extraction
+ *
+ * Only 'application/pdf' is currently recognised.
+ *
+ * @param string $mimeType mime-type to test
+ * @return boolean true if extraction is supported
+ */
+ public static function canExtractText($mimeType)
+ {
+   // Strict comparison on purpose: with a loose '==' a boolean true (and,
+   // before PHP 8, an integer 0) compares equal to the string literal
+   return 'application/pdf' === $mimeType;
+ }
+
+ /**
* Return true if derived mimeType is "image/*"
*
* @param string $filename
@@ -1696,6 +1718,40 @@
return $derivative;
}
+ $originalFullPath = $this->getAbsolutePath();
+ list($originalNameNoExtension) = explode('.', $this->getName());
+
+ switch ($usageId)
+ {
+ case QubitTerm::REFERENCE_ID:
+ $derivativeName = $originalNameNoExtension.'_'.$usageId.'.flv';
+ $derivativeFullPath =
sfConfig::get('sf_web_dir').$this->getPath().$derivativeName;
+ self::convertVideoToFlash($originalFullPath, $derivativeFullPath);
+ break;
+ case QubitTerm::THUMBNAIL_ID:
+ default:
+ $extension = '.'.self::THUMB_EXTENSION;
+ $derivativeName = $originalNameNoExtension.'_'.$usageId.$extension;
+ $derivativeFullPath =
sfConfig::get('sf_web_dir').$this->getPath().$derivativeName;
+ $maxDimensions = self::getImageMaxDimensions($usageId);
+ self::convertVideoToThumbnail($originalFullPath, $derivativeFullPath,
$maxDimensions[0], $maxDimensions[1]);
+ }
+
+ if (file_exists($derivativeFullPath) && 0 < ($byteSize =
filesize($derivativeFullPath)))
+ {
+ $derivative = new QubitDigitalObject;
+ $derivative->setPath($this->getPath());
+ $derivative->setName($derivativeName);
+ $derivative->parentId = $this->id;
+ $derivative->setByteSize($byteSize);
+ $derivative->usageId = $usageId;
+ $derivative->setMimeAndMediaType();
+ $derivative->createDerivatives = false;
+ $derivative->indexOnSave = false;
+ $derivative->save($connection);
+
+ return $derivative;
+ }
}
/**
@@ -1830,6 +1886,53 @@
return file_get_contents($tmpFilePath);
}
+
+ /*
+ * -----------------------------------------------------------------------
+ * TEXT METHODS
+ * -----------------------------------------------------------------------
+ */
+
+ /**
+ * Test if the pdftotext binary can be located with "which"
+ *
+ * @return boolean true if "which pdftotext" exits with status 0 and prints
+ *   at least one line of output
+ */
+ public static function hasPdfToText()
+ {
+   $lines = array();
+   $exitCode = null;
+   exec('which pdftotext', $lines, $exitCode);
+
+   // A non-zero exit status means the lookup failed
+   if (0 != $exitCode)
+   {
+     return false;
+   }
+
+   return count($lines) > 0;
+ }
+
+ /**
+ * Extract the text of the current digital object with pdftotext and save it
+ * as a 'text' property of this object
+ *
+ * Nothing is saved when pdftotext is not available, exits with a non-zero
+ * status, or produces no output.
+ *
+ * @param mixed $connection optional connection, passed through to
+ *   QubitProperty::save()
+ * @return string|null the extracted text, or null if nothing was extracted
+ */
+ public function extractText($connection = null)
+ {
+   // Test for the pdftotext binary
+   if (!self::hasPdfToText())
+   {
+     return;
+   }
+
+   // Quote the path so that spaces or shell metacharacters in the file name
+   // cannot break or alter the command; the trailing '-' makes pdftotext
+   // write the text to stdout, where exec() collects it line by line
+   $command = sprintf('pdftotext %s -', escapeshellarg($this->getAbsolutePath()));
+   exec($command, $output, $status);
+
+   if (0 == $status && 0 < count($output))
+   {
+     $text = implode(PHP_EOL, $output);
+
+     $property = new QubitProperty;
+     $property->objectId = $this->id;
+     $property->name = 'text';
+     $property->scope = 'Text extracted from orginal file via pdftotext';
+     $property->value = $text;
+     $property->indexOnSave = false;
+
+     $property->save($connection);
+
+     return $text;
+   }
+ }
+
/* -----------------------------------------------------------------------
* CHECKSUMS
* --------------------------------------------------------------------- */
--
You received this message because you are subscribed to the Google Groups
"Qubit Toolkit Commits" group.
To post to this group, send email to [email protected].
To unsubscribe from this group, send email to
[email protected].
For more options, visit this group at
http://groups.google.com/group/qubit-commits?hl=en.