tdraier 2005/11/30 20:24:17 CET
Modified files:
core/src/java/org/jahia/hibernate/dao JahiaIndexingJobDAO.java
core/src/java/org/jahia/services/fileextraction
FileExtractor.java
JahiaFileExtractionService.java
JahiaFileExtractionServiceImpl.java
JahiaMSExcelExtractor.java
JahiaMSPowerPointExtractor.java
JahiaMSWordExtractor.java
JahiaOfficeExtractor.java
MP3Extractor.java
PDFExtractor.java
TextExtractor.java
core/src/java/org/jahia/services/search/lucene/jdbc
LuceneJDBCSearchIndexer.java
core/src/java/org/jahia/services/search/lucene
AbstractLuceneSearchIndexer.java
core/src/java/org/jahia/services/search/valves
FieldSearchIndexProcessValveImpl.java
FileFieldIndexingThread.java
core/src/java/org/jahia/services/search
JahiaSearchBaseService.java
Log:
A few optimizations: removed the useless charset detection, and now check for
existing indexing jobs before enqueuing new ones
Revision Changes Path
1.4 +26 -1
jahia/core/src/java/org/jahia/hibernate/dao/JahiaIndexingJobDAO.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/hibernate/dao/JahiaIndexingJobDAO.java.diff?r1=1.3&r2=1.4&f=h
1.2 +7 -33
jahia/core/src/java/org/jahia/services/fileextraction/FileExtractor.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/services/fileextraction/FileExtractor.java.diff?r1=1.1&r2=1.2&f=h
1.3 +9 -27
jahia/core/src/java/org/jahia/services/fileextraction/JahiaFileExtractionService.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/services/fileextraction/JahiaFileExtractionService.java.diff?r1=1.2&r2=1.3&f=h
1.6 +10 -33
jahia/core/src/java/org/jahia/services/fileextraction/JahiaFileExtractionServiceImpl.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/services/fileextraction/JahiaFileExtractionServiceImpl.java.diff?r1=1.5&r2=1.6&f=h
1.2 +1 -18
jahia/core/src/java/org/jahia/services/fileextraction/JahiaMSExcelExtractor.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/services/fileextraction/JahiaMSExcelExtractor.java.diff?r1=1.1&r2=1.2&f=h
1.2 +1 -18
jahia/core/src/java/org/jahia/services/fileextraction/JahiaMSPowerPointExtractor.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/services/fileextraction/JahiaMSPowerPointExtractor.java.diff?r1=1.1&r2=1.2&f=h
1.2 +1 -18
jahia/core/src/java/org/jahia/services/fileextraction/JahiaMSWordExtractor.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/services/fileextraction/JahiaMSWordExtractor.java.diff?r1=1.1&r2=1.2&f=h
1.5 +10 -39
jahia/core/src/java/org/jahia/services/fileextraction/JahiaOfficeExtractor.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/services/fileextraction/JahiaOfficeExtractor.java.diff?r1=1.4&r2=1.5&f=h
1.4 +4 -37
jahia/core/src/java/org/jahia/services/fileextraction/MP3Extractor.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/services/fileextraction/MP3Extractor.java.diff?r1=1.3&r2=1.4&f=h
1.5 +13 -70
jahia/core/src/java/org/jahia/services/fileextraction/PDFExtractor.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/services/fileextraction/PDFExtractor.java.diff?r1=1.4&r2=1.5&f=h
1.2 +32 -49
jahia/core/src/java/org/jahia/services/fileextraction/TextExtractor.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/services/fileextraction/TextExtractor.java.diff?r1=1.1&r2=1.2&f=h
1.33 +8 -5
jahia/core/src/java/org/jahia/services/search/JahiaSearchBaseService.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/services/search/JahiaSearchBaseService.java.diff?r1=1.32&r2=1.33&f=h
1.9 +2 -2
jahia/core/src/java/org/jahia/services/search/lucene/AbstractLuceneSearchIndexer.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/services/search/lucene/AbstractLuceneSearchIndexer.java.diff?r1=1.8&r2=1.9&f=h
1.3 +1 -1
jahia/core/src/java/org/jahia/services/search/lucene/jdbc/LuceneJDBCSearchIndexer.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/services/search/lucene/jdbc/LuceneJDBCSearchIndexer.java.diff?r1=1.2&r2=1.3&f=h
1.3 +1 -7
jahia/core/src/java/org/jahia/services/search/valves/FieldSearchIndexProcessValveImpl.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/services/search/valves/FieldSearchIndexProcessValveImpl.java.diff?r1=1.2&r2=1.3&f=h
1.3 +2 -8
jahia/core/src/java/org/jahia/services/search/valves/FileFieldIndexingThread.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/core/src/java/org/jahia/services/search/valves/FileFieldIndexingThread.java.diff?r1=1.2&r2=1.3&f=h
Index: JahiaIndexingJobDAO.java
===================================================================
RCS file:
/home/cvs/repository/jahia/core/src/java/org/jahia/hibernate/dao/JahiaIndexingJobDAO.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- JahiaIndexingJobDAO.java 30 Nov 2005 16:23:56 -0000 1.3
+++ JahiaIndexingJobDAO.java 30 Nov 2005 19:24:15 -0000 1.4
@@ -3,7 +3,9 @@
*/
package org.jahia.hibernate.dao;
-import org.jahia.hibernate.model.indexingjob.JahiaIndexingJob;
+import org.jahia.hibernate.model.indexingjob.*;
+import org.jahia.services.search.indexingscheduler.IndexingJob;
+import org.jahia.services.search.indexingscheduler.impl.*;
import org.springframework.orm.hibernate3.HibernateTemplate;
import org.springframework.orm.hibernate3.support.HibernateDaoSupport;
@@ -67,6 +69,29 @@
public synchronized void save(JahiaIndexingJob job) {
HibernateTemplate template = getHibernateTemplate();
+
+ List old = null;
+ if ( job instanceof JahiaFieldIndexingJob ){
+ String query = "from JahiaFieldIndexingJob as indexingJob where
indexingJob.fieldId = ?";
+ old = template.find(query,
((JahiaFieldIndexingJob)job).getFieldId());
+ } else if ( job instanceof JahiaContainerIndexingJob ) {
+ String query = "from JahiaContainerIndexingJob as indexingJob
where indexingJob.ctnId = ?";
+ old = template.find(query,
((JahiaContainerIndexingJob)job).getCtnId());
+ } else if ( job instanceof JahiaContainerListIndexingJob ) {
+ String query = "from JahiaContainerListIndexingJob as
indexingJob where indexingJob.ctnListId = ?";
+ old = template.find(query,
((JahiaContainerListIndexingJob)job).getCtnListId());
+ } else if ( job instanceof JahiaPageIndexingJob ) {
+ String query = "from JahiaPageIndexingJob as indexingJob where
indexingJob.pageId = ?";
+ old = template.find(query,
((JahiaPageIndexingJob)job).getPageId());
+ } else if ( job instanceof JahiaRemoveFromIndexJob ){
+ String query = "from JahiaRemoveFromIndexJob as indexingJob
where indexingJob.keyFieldName = ? and indexingJob.keyFieldValue = ?";
+ old = template.find(query, new Object[]
{((JahiaRemoveFromIndexJob)job).getKeyFieldName(),
((JahiaRemoveFromIndexJob)job).getKeyFieldValue()});
+ }
+
+ if (old != null && !old.isEmpty()) {
+ job.setId(((JahiaIndexingJob)old.iterator().next()).getId());
+ }
+
template.merge(job);
}
}
Index: FileExtractor.java
===================================================================
RCS file:
/home/cvs/repository/jahia/core/src/java/org/jahia/services/fileextraction/FileExtractor.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- FileExtractor.java 14 Feb 2005 14:22:39 -0000 1.1
+++ FileExtractor.java 30 Nov 2005 19:24:15 -0000 1.2
@@ -5,34 +5,20 @@
public interface FileExtractor {
+
/**
- *
- * @param path String
+ *
+ * @param path String
* @param lastModified long
* @param fileStream InputStream
* @throws Exception
* @return String
- */
- public abstract ExtractedDocument getExtractedDocument(
- String path,
- long lastModified,
- InputStream fileStream)
- throws Exception;
-
- /**
- *
- * @param path String
- * @param lastModified long
- * @param fileStream InputStream
- * @param charSet String
- * @throws Exception
- * @return String
*/
public abstract ExtractedDocument getExtractedDocument(
- String path,
- long lastModified,
- InputStream fileStream,
- String charSet)
+ String path,
+ long lastModified,
+ InputStream fileStream
+ )
throws Exception;
/**
@@ -48,16 +34,4 @@
InputStream fileStream)
throws Exception;
- /**
- *
- * @param path String
- * @param lastModified long
- * @param fileStream InputStream
- * @return String
- */
- public abstract String getContentAsString(String path,
- long lastModified,
- InputStream fileStream,
- String charSet) throws
Exception;
-
}
Index: JahiaFileExtractionService.java
===================================================================
RCS file:
/home/cvs/repository/jahia/core/src/java/org/jahia/services/fileextraction/JahiaFileExtractionService.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- JahiaFileExtractionService.java 30 Sep 2005 15:34:01 -0000 1.2
+++ JahiaFileExtractionService.java 30 Nov 2005 19:24:15 -0000 1.3
@@ -36,22 +36,6 @@
public abstract FileExtractor getFileExtractor(String contentType)
throws JahiaException;
- /**
- *
- * @param contentType
- * @param cacheKey , a unique key used to cache the extraction (
serializing on disk )
- * @param lastModified
- * @param allowCache if true, use a cached extractio if any
- * @param fileStream
- * @return
- * @throws Exception
- */
- public abstract ExtractedDocument getExtractedDocument( String
contentType,
- String cacheKey,
- long lastModified,
- boolean allowCache,
- InputStream fileStream)
- throws Exception;
/**
*
@@ -60,29 +44,27 @@
* @param lastModified
* @param allowCache if true, use a cached extractio if any
* @param fileStream
- * @param charSet
* @return
* @throws Exception
*/
- public abstract ExtractedDocument getExtractedDocument( String
contentType,
- String cacheKey,
- long lastModified,
- boolean allowCache,
- InputStream fileStream,
- String charSet)
+ public abstract ExtractedDocument getExtractedDocument(String
contentType,
+ String cacheKey,
+ long lastModified,
+ boolean
allowCache,
+ InputStream
fileStream
+ )
throws Exception;
/**
*
* @param contentType
* @param cacheKey , a unique key used to cache the extraction (
serializing on disk )
- * @param charSet
* @return
* @throws Exception
*/
- public abstract ExtractedDocument getAlreadyExtractedDocument( String
contentType,
- String
cacheKey,
- String
charSet )
+ public abstract ExtractedDocument getAlreadyExtractedDocument(String
contentType,
+ String
cacheKey
+ )
throws Exception;
Index: JahiaFileExtractionServiceImpl.java
===================================================================
RCS file:
/home/cvs/repository/jahia/core/src/java/org/jahia/services/fileextraction/JahiaFileExtractionServiceImpl.java,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -r1.5 -r1.6
--- JahiaFileExtractionServiceImpl.java 30 Sep 2005 15:34:01 -0000
1.5
+++ JahiaFileExtractionServiceImpl.java 30 Nov 2005 19:24:15 -0000
1.6
@@ -21,11 +21,9 @@
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
import java.util.Properties;
-import java.util.HashMap;
import org.jahia.exceptions.JahiaException;
import org.jahia.exceptions.JahiaInitializationException;
-import org.jahia.settings.SettingsBean;
import org.jahia.utils.JahiaTools;
/**
@@ -126,39 +124,19 @@
/**
*
* @param contentType
- * @param cacheKey a unique key used to cache the extraction (
serializing on disk )
- * @param lastModified
- * @param allowCache if true, use a cached extractio if any
- * @param fileStream
- * @return
- * @throws Exception
- */
- public ExtractedDocument getExtractedDocument( String contentType,
- String cacheKey,
- long lastModified,
- boolean allowCache,
- InputStream fileStream)
- throws Exception {
- return getExtractedDocument(contentType, cacheKey,lastModified,
allowCache, fileStream, null);
- }
-
- /**
- *
- * @param contentType
* @param cacheKey , a unique key used to cache the extraction (
serializing on disk )
* @param lastModified
* @param allowCache if true, use a cached extractio if any
* @param fileStream
- * @param charSet
* @return
* @throws Exception
*/
- public ExtractedDocument getExtractedDocument( String contentType,
- String cacheKey,
- long lastModified,
- boolean allowCache,
- InputStream fileStream,
- String charSet )
+ public ExtractedDocument getExtractedDocument(String contentType,
+ String cacheKey,
+ long lastModified,
+ boolean allowCache,
+ InputStream fileStream
+ )
throws Exception {
ExtractedDocument extDoc = null;
@@ -194,7 +172,7 @@
}
}
if ( extDoc == null ){
- extDoc =
extractor.getExtractedDocument(cacheKey,lastModified,fileStream,charSet);
+ extDoc =
extractor.getExtractedDocument(cacheKey,lastModified,fileStream);
if ( extDoc == null ){
extDoc = new ExtractedDocumentImpl();
}
@@ -221,13 +199,12 @@
*
* @param contentType
* @param cacheKey , a unique key used to cache the extraction (
serializing on disk )
- * @param charSet
* @return
* @throws Exception
*/
- public ExtractedDocument getAlreadyExtractedDocument( String
contentType,
- String cacheKey,
- String charSet )
+ public ExtractedDocument getAlreadyExtractedDocument(String contentType,
+ String cacheKey
+ )
throws Exception {
ExtractedDocument extDoc = null;
Index: JahiaMSExcelExtractor.java
===================================================================
RCS file:
/home/cvs/repository/jahia/core/src/java/org/jahia/services/fileextraction/JahiaMSExcelExtractor.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- JahiaMSExcelExtractor.java 14 Feb 2005 14:22:39 -0000 1.1
+++ JahiaMSExcelExtractor.java 30 Nov 2005 19:24:15 -0000 1.2
@@ -26,22 +26,6 @@
public JahiaMSExcelExtractor(){
}
-
- /**
- * This method returns all the text of an HDF file.
- *
- * @param path String
- * @param lastModified long
- * @param fileStream InputStream
- * @throws IOException thrown if there was an error while parsing the
- * file format, notably if the file is an RTF file instead of a HDF file.
- * @return String
- */
- public String getContentAsString(String path, long lastModified,
- InputStream fileStream)
- throws IOException {
- return getContentAsString(path, lastModified, fileStream, null);
- }
/**
* This method returns all the text of an HDF file.
@@ -49,12 +33,11 @@
* @param path String
* @param lastModified long
* @param fileStream InputStream
- * @param charSet String
* @throws IOException thrown if there was an error while parsing the
file
* @return String
*/
public String getContentAsString(String path, long lastModified,
- InputStream fileStream, String charSet)
+ InputStream fileStream)
throws IOException {
MSExcelExtractor ex = new MSExcelExtractor("","","");
try {
Index: JahiaMSPowerPointExtractor.java
===================================================================
RCS file:
/home/cvs/repository/jahia/core/src/java/org/jahia/services/fileextraction/JahiaMSPowerPointExtractor.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- JahiaMSPowerPointExtractor.java 14 Feb 2005 14:22:39 -0000 1.1
+++ JahiaMSPowerPointExtractor.java 30 Nov 2005 19:24:15 -0000 1.2
@@ -34,28 +34,11 @@
* @param path String
* @param lastModified long
* @param fileStream InputStream
- * @throws IOException thrown if there was an error while parsing the
- * file format, notably if the file is an RTF file instead of a HDF file.
- * @return String
- */
- public String getContentAsString(String path, long lastModified,
- InputStream fileStream)
- throws IOException {
- return getContentAsString(path, lastModified, fileStream, null);
- }
-
- /**
- * This method returns all the text of an HDF file.
- *
- * @param path String
- * @param lastModified long
- * @param fileStream InputStream
- * @param charSet String
* @throws IOException thrown if there was an error while parsing the
file
* @return String
*/
public String getContentAsString(String path, long lastModified,
- InputStream fileStream, String charSet)
+ InputStream fileStream)
throws IOException {
MSPowerPointExtractor ex = new MSPowerPointExtractor("","","");
try {
Index: JahiaMSWordExtractor.java
===================================================================
RCS file:
/home/cvs/repository/jahia/core/src/java/org/jahia/services/fileextraction/JahiaMSWordExtractor.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- JahiaMSWordExtractor.java 14 Feb 2005 14:22:39 -0000 1.1
+++ JahiaMSWordExtractor.java 30 Nov 2005 19:24:15 -0000 1.2
@@ -33,28 +33,11 @@
* @param path String
* @param lastModified long
* @param fileStream InputStream
- * @throws IOException thrown if there was an error while parsing the
- * file format, notably if the file is an RTF file instead of a HDF file.
- * @return String
- */
- public String getContentAsString(String path, long lastModified,
- InputStream fileStream)
- throws IOException {
- return getContentAsString(path, lastModified, fileStream, null);
- }
-
- /**
- * This method returns all the text of an HDF file.
- *
- * @param path String
- * @param lastModified long
- * @param fileStream InputStream
- * @param charSet String
* @throws IOException thrown if there was an error while parsing the
file
* @return String
*/
public String getContentAsString(String path, long lastModified,
- InputStream fileStream, String charSet)
+ InputStream fileStream)
throws IOException {
MSWordExtractor ex = new MSWordExtractor("","","");
try {
Index: JahiaOfficeExtractor.java
===================================================================
RCS file:
/home/cvs/repository/jahia/core/src/java/org/jahia/services/fileextraction/JahiaOfficeExtractor.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- JahiaOfficeExtractor.java 11 Oct 2005 14:17:31 -0000 1.4
+++ JahiaOfficeExtractor.java 30 Nov 2005 19:24:15 -0000 1.5
@@ -9,7 +9,6 @@
import org.apache.commons.io.CopyUtils;
import org.apache.slide.extractor.OfficeExtractor;
-import org.apache.slide.common.Domain;
import org.apache.slide.common.PropertyName;
import org.jahia.services.sites.JahiaSitesSlideService;
@@ -20,36 +19,20 @@
org.apache.log4j.Logger.getLogger (JahiaOfficeExtractor.class);
+
/**
- *
- * @param path String
+ *
+ * @param path String
* @param lastModified long
* @param fileStream InputStream
* @throws Exception
* @return String
- */
- public ExtractedDocument getExtractedDocument(
- String path,
- long lastModified,
- InputStream fileStream)
- throws Exception{
- return this.getExtractedDocument(path,lastModified,fileStream,null);
- }
-
- /**
- *
- * @param path String
- * @param lastModified long
- * @param fileStream InputStream
- * @param charSet String
- * @throws Exception
- * @return String
*/
public ExtractedDocument getExtractedDocument(
- String path,
- long lastModified,
- InputStream fileStream,
- String charSet)
+ String path,
+ long lastModified,
+ InputStream fileStream
+ )
throws Exception{
//create a tmp output stream with the size of the content.
@@ -58,23 +41,11 @@
out.flush();
byte[] contents = out.toByteArray();
ExtractedDocument extDoc = this.getPropertiesExtractedDocument(new
ByteArrayInputStream(contents));
- extDoc.setContent(this.getContentAsString(path,lastModified,new
ByteArrayInputStream(contents),charSet));
+ extDoc.setContent(this.getContentAsString(path,lastModified,new
ByteArrayInputStream(contents)));
return extDoc;
}
- /**
- *
- * @param path String
- * @param lastModified long
- * @param fileStream InputStream
- * @throws Exception
- * @return String
- */
- public abstract String getContentAsString(String path,
- long lastModified,
- InputStream fileStream)
- throws Exception;
/**
*
@@ -85,8 +56,8 @@
*/
public abstract String getContentAsString(String path,
long lastModified,
- InputStream fileStream,
- String charSet) throws
Exception;
+ InputStream fileStream
+ ) throws Exception;
public Map extract(InputStream content){
Map map = null;
Index: MP3Extractor.java
===================================================================
RCS file:
/home/cvs/repository/jahia/core/src/java/org/jahia/services/fileextraction/MP3Extractor.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- MP3Extractor.java 25 Feb 2005 12:50:45 -0000 1.3
+++ MP3Extractor.java 30 Nov 2005 19:24:15 -0000 1.4
@@ -39,30 +39,13 @@
* @param path String
* @param lastModified long
* @param fileStream InputStream
- * @throws Exception
- * @return String
- * @todo Implement this org.jahia.services.fileextraction.FileExtractor
- * method
- */
- public String getContentAsString (String path, long lastModified,
- InputStream fileStream)
- throws Exception {
- return getContentAsString(path, lastModified, fileStream, null);
- }
-
- /**
- *
- * @param path String
- * @param lastModified long
- * @param fileStream InputStream
- * @param charSet String
* @return String
* @throws Exception
* @todo Implement this org.jahia.services.fileextraction.FileExtractor
* method
*/
- public String getContentAsString (String path, long lastModified,
- InputStream fileStream, String charSet)
+ public String getContentAsString(String path, long lastModified,
+ InputStream fileStream)
throws Exception {
StringBuffer contentBuf = new StringBuffer();
@@ -106,35 +89,19 @@
return contentBuf.toString();
}
- /**
- *
- * @param path String
- * @param lastModified long
- * @param fileStream InputStream
- * @throws Exception
- * @return String
- * @todo Implement this org.jahia.services.fileextraction.FileExtractor
- * method
- */
- public ExtractedDocument getExtractedDocument (String path,
- long lastModified, InputStream fileStream)
- throws Exception {
- return getExtractedDocument(path, lastModified, fileStream, null);
- }
/**
*
* @param path String
* @param lastModified long
* @param fileStream InputStream
- * @param charSet String
* @throws Exception
* @return String
* @todo Implement this org.jahia.services.fileextraction.FileExtractor
* method
*/
- public ExtractedDocument getExtractedDocument (String path,
- long lastModified, InputStream fileStream, String charSet)
+ public ExtractedDocument getExtractedDocument(String path,
+ long lastModified,
InputStream fileStream)
throws Exception {
ExtractedDocumentImpl extDoc = new ExtractedDocumentImpl();
Index: PDFExtractor.java
===================================================================
RCS file:
/home/cvs/repository/jahia/core/src/java/org/jahia/services/fileextraction/PDFExtractor.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- PDFExtractor.java 8 Apr 2005 12:48:40 -0000 1.4
+++ PDFExtractor.java 30 Nov 2005 19:24:15 -0000 1.5
@@ -6,13 +6,8 @@
import org.jahia.utils.*;
import org.jahia.services.sites.JahiaSitesSlideService;
import org.pdfbox.cos.COSDictionary;
-import org.pdfbox.cos.COSDocument;
import org.pdfbox.cos.COSName;
import org.pdfbox.cos.COSString;
-import org.pdfbox.encryption.DocumentEncryption;
-import org.pdfbox.exceptions.CryptographyException;
-import org.pdfbox.exceptions.InvalidPasswordException;
-import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.util.PDFTextStripper;
import org.pdfbox.util.*;
import org.pdfbox.pdmodel.PDDocument;
@@ -52,10 +47,10 @@
* @return String
*/
public synchronized ExtractedDocument getExtractedDocument(
- String path,
- long lastModified,
- InputStream fileStream,
- String charSet)
+ String path,
+ long lastModified,
+ InputStream fileStream
+ )
throws Exception{
ExtractedDocumentImpl extDoc = new ExtractedDocumentImpl();
@@ -72,22 +67,13 @@
}
//create a tmp output stream with the size of the content.
- ByteArrayOutputStream out = new ByteArrayOutputStream();
- OutputStreamWriter writer = null;
- if ( charSet != null ) {
- writer = new OutputStreamWriter(out,charSet);
- } else {
- writer = new OutputStreamWriter(out);
- }
+ StringWriter writer = new StringWriter();
+
PDFTextStripper stripper = new PDFTextStripper();
stripper.writeText(pdfDocument, writer);
writer.close();
- String content = out.toString(charSet);
- if ( content == null ){
- content = "";
- }
- extDoc.setContent(content);
+ extDoc.setContent(writer.toString());
PDDocumentInformation info =
pdfDocument.getDocumentInformation();
/*
@@ -168,40 +154,9 @@
* @throws Exception
* @return String
*/
- public synchronized ExtractedDocument getExtractedDocument(
- String path,
- long lastModified,
- InputStream fileStream)
- throws Exception{
- return this.getExtractedDocument(path, lastModified, fileStream,
null);
- }
-
- /**
- *
- * @param path String
- * @param lastModified long
- * @param fileStream InputStream
- * @throws Exception
- * @return String
- */
public String getContentAsString(String path, long lastModified,
- InputStream fileStream)
- throws Exception {
- return getContentAsString(path, lastModified, fileStream, null);
- }
-
- /**
- *
- * @param path String
- * @param lastModified long
- * @param fileStream InputStream
- * @param charSet String
- * @throws Exception
- * @return String
- */
- public String getContentAsString(String path, long lastModified,
- InputStream fileStream,
- String charSet) throws Exception {
+ InputStream fileStream
+ ) throws Exception {
this.path = path;
this.lastModifed = lastModified;
String strVal = null;
@@ -210,7 +165,7 @@
Reader pdfReader = null;
try {
long startTime = System.currentTimeMillis();
- pdfReader = this.getPDFReader(fileStream, charSet);
+ pdfReader = this.getPDFReader(fileStream);
long elapsedTime = System.currentTimeMillis() - startTime;
logger.info("Finished pdf extraction with PDFBox in " +
elapsedTime + "ms.");
@@ -238,12 +193,7 @@
return strVal;
}
- public Reader getPDFReader(InputStream fileStream) throws IOException {
- return getPDFReader(fileStream, null);
- }
-
- public Reader getPDFReader(InputStream fileStream,
- String charSet) throws IOException
+ public Reader getPDFReader(InputStream fileStream) throws IOException
{
Reader reader = null;
PDDocument pdfDocument = null;
@@ -254,18 +204,11 @@
pdfDocument.decrypt("");
}
//create a tmp output stream with the size of the content.
- ByteArrayOutputStream out = new ByteArrayOutputStream();
- OutputStreamWriter writer = new OutputStreamWriter(out);
+ StringWriter writer = new StringWriter();
PDFTextStripper stripper = new PDFTextStripper();
stripper.writeText(pdfDocument, writer);
writer.close();
- byte[] contents = out.toByteArray();
- if ( charSet != null ){
- reader = new InputStreamReader(new
ByteArrayInputStream(contents),
- charSet);
- } else {
- reader = new InputStreamReader(new
ByteArrayInputStream(contents));
- }
+ reader = new StringReader(writer.toString());
}
catch( Throwable t )
{
Index: TextExtractor.java
===================================================================
RCS file:
/home/cvs/repository/jahia/core/src/java/org/jahia/services/fileextraction/TextExtractor.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- TextExtractor.java 14 Feb 2005 14:22:39 -0000 1.1
+++ TextExtractor.java 30 Nov 2005 19:24:15 -0000 1.2
@@ -3,8 +3,11 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
+import java.io.ByteArrayOutputStream;
import org.jahia.utils.FileUtils;
+import org.mozilla.intl.chardet.nsDetector;
+import org.mozilla.intl.chardet.nsPSMDetector;
/**
@@ -27,39 +30,22 @@
/**
*
* @param path String
- * @param lastModified long
- * @param fileStream InputStream
- * @throws Exception
- * @return String
- */
- public ExtractedDocument getExtractedDocument(
- String path,
- long lastModified,
- InputStream fileStream)
- throws Exception{
- return getExtractedDocument(path,lastModified,fileStream,null);
- }
-
- /**
- *
- * @param path String
- * @param lastModified long
- * @param fileStream InputStream
- * @param charSet String
- * @throws Exception
- * @return String
+ * @param lastModified long
+ * @param fileStream InputStream
+ * @throws Exception
+ * @return String
*/
public ExtractedDocument getExtractedDocument(
- String path,
- long lastModified,
- InputStream fileStream,
- String charSet)
+ String path,
+ long lastModified,
+ InputStream fileStream
+ )
throws Exception{
- ExtractedDocumentImpl extDoc = new ExtractedDocumentImpl();
-
extDoc.setContent(this.getContentAsString(path,lastModified,fileStream,charSet));
+ ExtractedDocumentImpl extDoc = new ExtractedDocumentImpl();
+
extDoc.setContent(this.getContentAsString(path,lastModified,fileStream));
return extDoc;
}
-
+
/**
* This method returns all the text of an HDF file.
*
@@ -73,29 +59,26 @@
public String getContentAsString(String path, long lastModified,
InputStream fileStream)
throws IOException {
- return getContentAsString(path, lastModified, fileStream, null);
- }
- /**
- * This method returns all the text of an HDF file.
- *
- * @param path String
- * @param lastModified long
- * @param fileStream InputStream
- * @param charSet String
- * @throws IOException thrown if there was an error while parsing the
- * file format, notably if the file is an RTF file instead of a HDF file.
- * @return String
- */
- public String getContentAsString(String path, long lastModified,
- InputStream fileStream, String charSet)
- throws IOException {
- InputStreamReader reader = null;
- if ( charSet != null ){
- reader = new InputStreamReader(fileStream, charSet);
+
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ byte[] b = new byte[1024];
+ int l = 0;
+ while ((l = fileStream.read(b)) > 0) {
+ baos.write(b,0,l);
+ }
+ nsDetector det = new nsDetector(nsPSMDetector.ALL);
+ byte[] bytes = baos.toByteArray();
+ det.Init(null);
+ det.DoIt(bytes, bytes.length, true);
+ det.DataEnd();
+
+ // charset detection does not seem to work very well
+ String[]charSets = det.getProbableCharsets();
+ if (charSets.length>0) {
+ return new String(bytes,charSets[0]);
} else {
- reader = new InputStreamReader(fileStream);
+ return new String(bytes);
}
- return FileUtils.readerToString(reader);
}
}
Index: LuceneJDBCSearchIndexer.java
===================================================================
RCS file:
/home/cvs/repository/jahia/core/src/java/org/jahia/services/search/lucene/jdbc/LuceneJDBCSearchIndexer.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- LuceneJDBCSearchIndexer.java 27 Apr 2005 13:37:58 -0000 1.2
+++ LuceneJDBCSearchIndexer.java 30 Nov 2005 19:24:16 -0000 1.3
@@ -26,7 +26,7 @@
/**
* Created by IntelliJ IDEA.
* User: hollis
- * Date: 15 févr. 2005
+ * Date: 15 f�vr. 2005
* Time: 13:05:57
* To change this template use File | Settings | File Templates.
*/
Index: AbstractLuceneSearchIndexer.java
===================================================================
RCS file:
/home/cvs/repository/jahia/core/src/java/org/jahia/services/search/lucene/AbstractLuceneSearchIndexer.java,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -r1.8 -r1.9
--- AbstractLuceneSearchIndexer.java 29 Nov 2005 13:12:27 -0000 1.8
+++ AbstractLuceneSearchIndexer.java 30 Nov 2005 19:24:16 -0000 1.9
@@ -24,7 +24,7 @@
/**
* Created by IntelliJ IDEA.
* User: hollis
- * Date: 15 févr. 2005
+ * Date: 15 f�vr. 2005
* Time: 13:05:57
* To change this template use File | Settings | File Templates.
*/
@@ -859,4 +859,4 @@
}
}
-}
+}
\ No newline at end of file
Index: FieldSearchIndexProcessValveImpl.java
===================================================================
RCS file:
/home/cvs/repository/jahia/core/src/java/org/jahia/services/search/valves/FieldSearchIndexProcessValveImpl.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- FieldSearchIndexProcessValveImpl.java 11 Oct 2005 14:18:43 -0000
1.2
+++ FieldSearchIndexProcessValveImpl.java 30 Nov 2005 19:24:17 -0000
1.3
@@ -150,16 +150,10 @@
doc.setFieldValue(JahiaSearchConstant.FILE_CONTENT_TYPE,contentType);
if (contentType != null && !file.getPath().equals("#")) {
try {
- InputStream ins = file.downloadFile();
- String charSet = null; // by default open as
ascii
- CharsetDetection charsetDet = new
CharsetDetection();
- charsetDet.charsetDetection(ins);
- charSet = charsetDet.getCharset();
- ins.close();
ExtractedDocument extDoc = ServicesRegistry
.getInstance().getFileExtractionService()
.getAlreadyExtractedDocument(
-
contentType,file.getPath(),charSet);
+ contentType,file.getPath());
if ( extDoc != null ){
strVal = extDoc.getContentAsString();
doc.addFieldValue(JahiaSearchConstant.FILE_CONTENT_FULLTEXT_SEARCH_FIELD,strVal);
Index: FileFieldIndexingThread.java
===================================================================
RCS file:
/home/cvs/repository/jahia/core/src/java/org/jahia/services/search/valves/FileFieldIndexingThread.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- FileFieldIndexingThread.java 24 Oct 2005 12:42:29 -0000 1.2
+++ FileFieldIndexingThread.java 30 Nov 2005 19:24:17 -0000 1.3
@@ -52,12 +52,6 @@
String contentType = fField.getType();
if (contentType != null && !file.getPath().equals("#")) {
try {
- InputStream ins = file.downloadFile();
- String charSet = null; // by default open as
ascii
- CharsetDetection charsetDet = new
CharsetDetection();
- charsetDet.charsetDetection(ins);
- charSet = charsetDet.getCharset();
- ins.close();
long lastModifiedDate =
System.currentTimeMillis();
try {
lastModifiedDate = file.getJahiaFileField()
@@ -65,10 +59,10 @@
} catch (Throwable t) {
logger.debug(t);
}
- ins = file.downloadFile();
+ InputStream ins = file.downloadFile();
ServicesRegistry.getInstance().getFileExtractionService()
.getExtractedDocument(contentType,
file.getPath(), lastModifiedDate,
- true, ins, charSet);
+ true, ins);
ins.close();
ServicesRegistry.getInstance().getJahiaSearchService()
.indexField(field.getID(),
this.context.getUser(),false, false);
Index: JahiaSearchBaseService.java
===================================================================
RCS file:
/home/cvs/repository/jahia/core/src/java/org/jahia/services/search/JahiaSearchBaseService.java,v
retrieving revision 1.32
retrieving revision 1.33
diff -u -r1.32 -r1.33
--- JahiaSearchBaseService.java 30 Nov 2005 16:25:16 -0000 1.32
+++ JahiaSearchBaseService.java 30 Nov 2005 19:24:17 -0000 1.33
@@ -47,7 +47,6 @@
import org.jahia.services.scheduler.SchedulerService;
import org.jahia.services.search.indexingscheduler.IndexingJob;
import org.jahia.services.search.indexingscheduler.IndexingJobConsummer;
-import org.jahia.services.search.indexingscheduler.IndexingJobTools;
import org.jahia.services.search.indexingscheduler.impl.ContainerIndexingJob;
import org.jahia.services.search.indexingscheduler.impl.FieldIndexingJob;
import org.jahia.services.search.indexingscheduler.impl.PageIndexingJob;
@@ -290,16 +289,14 @@
while ( !this.disabled && this.localIndexing ){
cachedAdminUsers = new HashMap();
try {
- Thread.sleep(2000);
+ Thread.sleep(20000);
} catch ( InterruptedException inte ){
}
if ( sitesServ == null || !sitesServ.isStarted() ){
continue;
}
try {
- jobsList = IndexingJobTools.resolveIndexingJobs(
-
this.indJobMgr.getIndexingJobsAfter(this.getLastIndexingJobTime(),
- false));
+ jobsList =
this.indJobMgr.getIndexingJobsAfter(this.getLastIndexingJobTime(),false);
Iterator jobs = jobsList.iterator();
if ( jobsList.size()>0 ){
while (jobs.hasNext()) {
@@ -552,11 +549,13 @@
"".equals(keyFieldName.trim()))
return;
+
RemoveFromIndexJob job = null;
if (allowQueuing || (!allowQueuing && notifyCluster)) {
job = new RemoveFromIndexJob(
siteId, keyFieldName, keyFieldValue, user.getUserKey(),
System.currentTimeMillis());
+
}
if (!allowQueuing) {
@@ -816,6 +815,7 @@
if (allowQueuing || (!allowQueuing && notifyCluster)) {
indJob =
new ContainerIndexingJob(ctnId, user.getUserKey(),
System.currentTimeMillis());
+
}
if (!allowQueuing) {
if (notifyCluster) {
@@ -912,6 +912,8 @@
if (allowQueuing || (!allowQueuing && notifyCluster)) {
indJob =
new PageIndexingJob(pageId, user.getUserKey(),
System.currentTimeMillis());
+
+
}
if (!allowQueuing) {
if (notifyCluster) {
@@ -1011,6 +1013,7 @@
if (allowQueuing || (!allowQueuing && notifyCluster)) {
indJob =
new FieldIndexingJob(fieldID, user.getUserKey(),
System.currentTimeMillis());
+
}
if (!allowQueuing) {
if (notifyCluster) {