From aff91da6a40ee9ae4203b4f156e44f898aa3b806 Mon Sep 17 00:00:00 2001 From: srosse <none@none> Date: Wed, 25 Apr 2012 14:31:37 +0200 Subject: [PATCH] OO-237: reduce log level for "errors" while indexing documents to warn --- .../RemoteGroupCreationOverXMPP.java | 2 +- .../model/artefacts/FileArtefactHandler.java | 7 +- .../org/olat/search/_spring/searchContext.xml | 4 + .../search/service/SearchServiceFactory.java | 9 -- .../service/document/file/ExcelDocument.java | 3 +- .../document/file/ExcelOOXMLDocument.java | 18 +-- .../document/file/FileDocumentFactory.java | 127 +++++++++--------- .../service/document/file/HtmlDocument.java | 3 +- .../service/document/file/OpenDocument.java | 2 +- .../service/document/file/PPT2Text.java | 91 ------------- .../service/document/file/PdfDocument.java | 7 +- .../document/file/PowerPointDocument.java | 53 +++++++- .../file/PowerPointOOXMLDocument.java | 3 +- .../service/document/file/TextDocument.java | 4 +- .../service/document/file/UnkownDocument.java | 2 +- .../service/document/file/WordDocument.java | 3 +- .../document/file/WordOOXMLDocument.java | 3 +- .../service/document/file/XmlDocument.java | 15 ++- .../search/service/indexer/FolderIndexer.java | 12 +- .../service/indexer/FolderIndexerWorker.java | 12 +- .../service/indexer/FullIndexerStatus.java | 7 +- .../course/DialogCourseNodeIndexer.java | 12 +- .../file/FileDocumentFactoryTest.java | 13 +- 23 files changed, 174 insertions(+), 238 deletions(-) delete mode 100644 src/main/java/org/olat/search/service/document/file/PPT2Text.java diff --git a/src/main/java/org/olat/instantMessaging/syncservice/RemoteGroupCreationOverXMPP.java b/src/main/java/org/olat/instantMessaging/syncservice/RemoteGroupCreationOverXMPP.java index a9c250d3c0c..983550e4f2d 100644 --- a/src/main/java/org/olat/instantMessaging/syncservice/RemoteGroupCreationOverXMPP.java +++ b/src/main/java/org/olat/instantMessaging/syncservice/RemoteGroupCreationOverXMPP.java @@ -185,7 +185,7 @@ public class RemoteGroupCreationOverXMPP implements InstantMessagingGroupSynchro collector.cancel(); if (response == null) { - log.error("Error while trying to create/delete group at IM server. Response was null! packet type: "+packet.getClass()); + log.warn("Error while trying to create/delete group at IM server. Response was null! packet type: "+packet.getClass()); return false; } if (response.getError() != null) { diff --git a/src/main/java/org/olat/portfolio/model/artefacts/FileArtefactHandler.java b/src/main/java/org/olat/portfolio/model/artefacts/FileArtefactHandler.java index ea2d0fbc7df..703d8db6348 100644 --- a/src/main/java/org/olat/portfolio/model/artefacts/FileArtefactHandler.java +++ b/src/main/java/org/olat/portfolio/model/artefacts/FileArtefactHandler.java @@ -21,6 +21,7 @@ package org.olat.portfolio.model.artefacts; import org.apache.lucene.document.Document; +import org.olat.core.CoreSpringFactory; import org.olat.core.commons.modules.bc.FolderConfig; import org.olat.core.commons.modules.bc.meta.MetaInfo; import org.olat.core.commons.modules.bc.meta.MetaInfoFactory; @@ -41,7 +42,6 @@ import org.olat.portfolio.manager.EPFrontendManager; import org.olat.portfolio.ui.artefacts.view.details.FileArtefactDetailsController; import org.olat.repository.RepositoryManager; import org.olat.search.service.SearchResourceContext; -import org.olat.search.service.SearchServiceFactory; import org.olat.search.service.document.file.FileDocumentFactory; /** @@ -166,8 +166,9 @@ public class FileArtefactHandler extends EPAbstractHandler<FileArtefact> { VFSItem file = ePFManager.getArtefactContainer(artefact).resolve(filename); if (file != null && file instanceof VFSLeaf) { try { - if (SearchServiceFactory.getFileDocumentFactory().isFileSupported((VFSLeaf)file)) { - Document doc = FileDocumentFactory.createDocument(context, (VFSLeaf)file); + FileDocumentFactory docFactory = CoreSpringFactory.getImpl(FileDocumentFactory.class); + if (docFactory.isFileSupported((VFSLeaf)file)) { + Document doc = docFactory.createDocument(context, (VFSLeaf)file); String content = doc.get(AbstractOlatDocument.CONTENT_FIELD_NAME); sb.append(content); } diff --git a/src/main/java/org/olat/search/_spring/searchContext.xml b/src/main/java/org/olat/search/_spring/searchContext.xml index a494cad471b..d67ca0bc52f 100644 --- a/src/main/java/org/olat/search/_spring/searchContext.xml +++ b/src/main/java/org/olat/search/_spring/searchContext.xml @@ -55,6 +55,10 @@ <property name="searchQueue" ref="searchQueue"/> <property name="receiveTimeout" value="45000"/> </bean> + + <bean id="fileDocumentFactory" class="org.olat.search.service.document.file.FileDocumentFactory"> + <constructor-arg index="0" ref="searchModule" /> + </bean> <!-- The metadata fields that are supported in the search engine --> <bean id="SearchMetadataFieldsProvider" class="org.olat.search.service.SimpleDublinCoreMetadataFieldsProvider" /> diff --git a/src/main/java/org/olat/search/service/SearchServiceFactory.java b/src/main/java/org/olat/search/service/SearchServiceFactory.java index 6954c9a583e..92729548278 100644 --- a/src/main/java/org/olat/search/service/SearchServiceFactory.java +++ b/src/main/java/org/olat/search/service/SearchServiceFactory.java @@ -26,7 +26,6 @@ package org.olat.search.service; import org.olat.core.commons.services.search.SearchService; -import org.olat.search.service.document.file.FileDocumentFactory; /** * @@ -35,16 +34,12 @@ import org.olat.search.service.document.file.FileDocumentFactory; public class SearchServiceFactory { private static SearchService searchService_; - private static FileDocumentFactory fileDocumentFactory; /** * [used by spring] */ private SearchServiceFactory(SearchService searchService) { searchService_ = searchService; - if (searchService.getSearchModuleConfig() != null) { - fileDocumentFactory = new FileDocumentFactory(searchService.getSearchModuleConfig()); - } } @@ -55,8 +50,4 @@ public class SearchServiceFactory { public static boolean isServiceEnabled() { return searchService_.isEnabled(); } - - public static FileDocumentFactory getFileDocumentFactory() { - return fileDocumentFactory; - } } diff --git a/src/main/java/org/olat/search/service/document/file/ExcelDocument.java b/src/main/java/org/olat/search/service/document/file/ExcelDocument.java index 85eaa0335cb..928848b228a 100644 --- a/src/main/java/org/olat/search/service/document/file/ExcelDocument.java +++ b/src/main/java/org/olat/search/service/document/file/ExcelDocument.java @@ -46,12 +46,13 @@ import org.olat.search.service.SearchResourceContext; * @author Christian Guretzki */ public class ExcelDocument extends FileDocument { + private static final long serialVersionUID = 1592080527374169362L; private static final OLog log = Tracing.createLoggerFor(ExcelDocument.class); public final static String FILE_TYPE = "type.file.excel"; public ExcelDocument() { - super(); + // } public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) throws IOException, DocumentException, diff --git a/src/main/java/org/olat/search/service/document/file/ExcelOOXMLDocument.java b/src/main/java/org/olat/search/service/document/file/ExcelOOXMLDocument.java index 9020a84f8b2..a4a649fe433 100644 --- a/src/main/java/org/olat/search/service/document/file/ExcelOOXMLDocument.java +++ b/src/main/java/org/olat/search/service/document/file/ExcelOOXMLDocument.java @@ -28,30 +28,16 @@ package org.olat.search.service.document.file; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; -import java.io.PrintStream; -import java.util.Iterator; import org.apache.lucene.document.Document; -import org.apache.poi.POIXMLDocument; -import org.apache.poi.POIXMLTextExtractor; -import org.apache.poi.extractor.ExtractorFactory; -import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.ss.usermodel.BuiltinFormats; -import org.apache.poi.ss.usermodel.Cell; -import org.apache.poi.ss.usermodel.Comment; import org.apache.poi.ss.usermodel.DataFormatter; -import org.apache.poi.ss.usermodel.HeaderFooter; -import org.apache.poi.ss.usermodel.Row; -import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable; import org.apache.poi.xssf.eventusermodel.XSSFReader; import org.apache.poi.xssf.model.SharedStringsTable; import org.apache.poi.xssf.model.StylesTable; -import org.apache.poi.xssf.usermodel.XSSFCell; import org.apache.poi.xssf.usermodel.XSSFCellStyle; import org.apache.poi.xssf.usermodel.XSSFRichTextString; -import org.apache.poi.xssf.usermodel.XSSFSheet; -import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.olat.core.logging.OLog; import org.olat.core.logging.Tracing; import org.olat.core.util.FileUtils; @@ -61,7 +47,6 @@ import org.olat.search.service.SearchResourceContext; import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTHeaderFooter; import org.xml.sax.Attributes; import org.xml.sax.InputSource; -import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; @@ -75,12 +60,13 @@ import org.xml.sax.helpers.XMLReaderFactory; * @author srosse, stephane.rosse@frentix.com */ public class ExcelOOXMLDocument extends FileDocument { + private static final long serialVersionUID = -7566825844774480380L; private static final OLog log = Tracing.createLoggerFor(ExcelOOXMLDocument.class); public final static String FILE_TYPE = "type.file.excel"; public ExcelOOXMLDocument() { - super(); + // } public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) throws IOException, DocumentException, diff --git a/src/main/java/org/olat/search/service/document/file/FileDocumentFactory.java b/src/main/java/org/olat/search/service/document/file/FileDocumentFactory.java index 480d1390ad2..35660ca8fc3 100644 --- a/src/main/java/org/olat/search/service/document/file/FileDocumentFactory.java +++ b/src/main/java/org/olat/search/service/document/file/FileDocumentFactory.java @@ -33,7 +33,6 @@ import org.olat.core.logging.OLog; import org.olat.core.logging.Tracing; import org.olat.core.util.vfs.LocalImpl; import org.olat.core.util.vfs.VFSLeaf; -import org.olat.core.util.vfs.VFSManager; import org.olat.search.service.SearchResourceContext; /** @@ -53,8 +52,6 @@ public class FileDocumentFactory { private static OLog log = Tracing.createLoggerFor(FileDocumentFactory.class); - static FileDocumentFactory instance; - private final static String PDF_SUFFIX = "pdf"; private final static String EXCEL_SUFFIX = "xls"; private final static String WORD_SUFFIX = "doc"; @@ -85,73 +82,79 @@ public class FileDocumentFactory { * @param searchModule */ public FileDocumentFactory(SearchModule module) { - instance = this; searchModule = module; } - public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) - throws DocumentNotImplementedException, IOException, DocumentException, DocumentAccessException { - - String fileName = leaf.getName(); - String suffix = FileTypeDetector.getSuffix(leaf); - if (log.isDebug()) log.debug("suffix=" + suffix); + public Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) + throws IOException, DocumentAccessException { + try { + String fileName = leaf.getName(); + String suffix = FileTypeDetector.getSuffix(leaf); + if (log.isDebug()) log.debug("suffix=" + suffix); - if (PDF_SUFFIX.indexOf(suffix) >= 0) { - if(searchModule.getPdfFileEnabled()) - return PdfDocument.createDocument(leafResourceContext, leaf); - return null; - } - if (HTML_SUFFIX.indexOf(suffix) >= 0) { - return HtmlDocument.createDocument(leafResourceContext, leaf); - } - if (XML_SUFFIX.indexOf(suffix) >= 0) { - if(IMS_MANIFEST_FILE.equals(fileName)) { - return IMSMetadataDocument.createDocument(leafResourceContext, leaf); + Document doc = null; + if (PDF_SUFFIX.indexOf(suffix) >= 0) { + if(searchModule.getPdfFileEnabled()) { + doc = PdfDocument.createDocument(leafResourceContext, leaf); + } + } else if (HTML_SUFFIX.indexOf(suffix) >= 0) { + doc = HtmlDocument.createDocument(leafResourceContext, leaf); + } else if (XML_SUFFIX.indexOf(suffix) >= 0) { + if(IMS_MANIFEST_FILE.equals(fileName)) { + doc = IMSMetadataDocument.createDocument(leafResourceContext, leaf); + } else { + doc = XmlDocument.createDocument(leafResourceContext, leaf); + } + } else if (TEXT_SUFFIX.indexOf(suffix) >= 0) { + doc = TextDocument.createDocument(leafResourceContext, leaf); + //microsoft openxml + } else if (suffix.indexOf(WORD_X_SUFFIX) >= 0) { + doc = WordOOXMLDocument.createDocument(leafResourceContext, leaf); + } else if (suffix.indexOf(EXCEL_X_SUFFIX) >= 0) { + if (searchModule.getExcelFileEnabled()) { + doc = ExcelOOXMLDocument.createDocument(leafResourceContext, leaf); + } + } else if (suffix.indexOf(POWERPOINT_X_SUFFIX) >= 0) { + if(searchModule.getPptFileEnabled()) { + doc = PowerPointOOXMLDocument.createDocument(leafResourceContext, leaf); + } + //microsoft + } else if (WORD_SUFFIX.indexOf(suffix) >= 0) { + doc = WordDocument.createDocument(leafResourceContext, leaf); + } else if (POWERPOINT_SUFFIX.indexOf(suffix) >= 0) { + if(searchModule.getPptFileEnabled()) { + doc = PowerPointDocument.createDocument(leafResourceContext, leaf); + } + } else if (EXCEL_SUFFIX.indexOf(suffix) >= 0) { + if (searchModule.getExcelFileEnabled()) { + doc = ExcelDocument.createDocument(leafResourceContext, leaf); + } + //open document + } else if (OD_TEXT_SUFFIX.indexOf(suffix) >= 0 || OD_SPREADSHEET_SUFFIX.indexOf(suffix) >= 0 + || OD_PRESENTATION_SUFFIX.indexOf(suffix) >= 0 || OD_FORMULA_SUFFIX.indexOf(suffix) >= 0 + || OD_GRAPHIC_SUFFIX.indexOf(suffix) >= 0) { + doc = OpenDocument.createDocument(leafResourceContext, leaf); } - return XmlDocument.createDocument(leafResourceContext, leaf); - } - if (TEXT_SUFFIX.indexOf(suffix) >= 0) { - return TextDocument.createDocument(leafResourceContext, leaf); - } - - //microsoft openxml - if (suffix.indexOf(WORD_X_SUFFIX) >= 0) { - return WordOOXMLDocument.createDocument(leafResourceContext, leaf); - } - if (suffix.indexOf(EXCEL_X_SUFFIX) >= 0) { - if (searchModule.getExcelFileEnabled()) - return ExcelOOXMLDocument.createDocument(leafResourceContext, leaf); - return null; - } - if (suffix.indexOf(POWERPOINT_X_SUFFIX) >= 0) { - if(searchModule.getPptFileEnabled()) - return PowerPointOOXMLDocument.createDocument(leafResourceContext, leaf); - return null; - } - - //microsoft - if (WORD_SUFFIX.indexOf(suffix) >= 0) { - return WordDocument.createDocument(leafResourceContext, leaf); - } - if (POWERPOINT_SUFFIX.indexOf(suffix) >= 0) { - if(searchModule.getPptFileEnabled()) - return PowerPointDocument.createDocument(leafResourceContext, leaf); - return null; + + if(doc == null) { + doc = createUnkownDocument(leafResourceContext, leaf); + } + return doc; + } catch(DocumentNotImplementedException e) { + log.warn("Cannot index document (no indexer for it):" + leaf, e); + return createUnkownDocument(leafResourceContext, leaf); + } catch (DocumentException e) { + log.warn("Cannot index document:" + leaf, e); + return createUnkownDocument(leafResourceContext, leaf); } - if (EXCEL_SUFFIX.indexOf(suffix) >= 0) { - if (searchModule.getExcelFileEnabled()) - return ExcelDocument.createDocument(leafResourceContext, leaf); + } + + private Document createUnkownDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) { + try { + return UnkownDocument.createDocument(leafResourceContext, leaf); + } catch (Exception e) { return null; } - - //open document - if (OD_TEXT_SUFFIX.indexOf(suffix) >= 0 || OD_SPREADSHEET_SUFFIX.indexOf(suffix) >= 0 - || OD_PRESENTATION_SUFFIX.indexOf(suffix) >= 0 || OD_FORMULA_SUFFIX.indexOf(suffix) >= 0 - || OD_GRAPHIC_SUFFIX.indexOf(suffix) >= 0) { - return OpenDocument.createDocument(leafResourceContext, leaf); - } - - return UnkownDocument.createDocument(leafResourceContext, leaf); } /** diff --git a/src/main/java/org/olat/search/service/document/file/HtmlDocument.java b/src/main/java/org/olat/search/service/document/file/HtmlDocument.java index 64679c5fe38..13f5db395f3 100644 --- a/src/main/java/org/olat/search/service/document/file/HtmlDocument.java +++ b/src/main/java/org/olat/search/service/document/file/HtmlDocument.java @@ -41,12 +41,13 @@ import org.olat.search.service.SearchResourceContext; * @author Christian Guretzki */ public class HtmlDocument extends FileDocument { + private static final long serialVersionUID = 4816587780227792396L; private static final OLog log = Tracing.createLoggerFor(HtmlDocument.class); public static final String FILE_TYPE = "type.file.html"; public HtmlDocument() { - super(); + // } public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) throws IOException,DocumentException,DocumentAccessException { diff --git a/src/main/java/org/olat/search/service/document/file/OpenDocument.java b/src/main/java/org/olat/search/service/document/file/OpenDocument.java index c8b732bc4dd..fe4b2453818 100644 --- a/src/main/java/org/olat/search/service/document/file/OpenDocument.java +++ b/src/main/java/org/olat/search/service/document/file/OpenDocument.java @@ -48,6 +48,7 @@ import org.xml.sax.helpers.XMLReaderFactory; * @author srosse, stephane.rosse@frentix.com */ public class OpenDocument extends FileDocument { + private static final long serialVersionUID = 7285894180135411850L; private static final OLog log = Tracing.createLoggerFor(OpenDocument.class); public final static String TEXT_FILE_TYPE = "type.file.odt"; @@ -99,7 +100,6 @@ public class OpenDocument extends FileDocument { } catch (DocumentException e) { throw e; } catch (Exception e) { - log.error("", e); throw new DocumentException(e.getMessage()); } finally { FileUtils.closeSafely(zip); diff --git a/src/main/java/org/olat/search/service/document/file/PPT2Text.java b/src/main/java/org/olat/search/service/document/file/PPT2Text.java deleted file mode 100644 index 692c16f8d9d..00000000000 --- a/src/main/java/org/olat/search/service/document/file/PPT2Text.java +++ /dev/null @@ -1,91 +0,0 @@ -/** -* OLAT - Online Learning and Training<br> -* http://www.olat.org -* <p> -* Licensed under the Apache License, Version 2.0 (the "License"); <br> -* you may not use this file except in compliance with the License.<br> -* You may obtain a copy of the License at -* <p> -* http://www.apache.org/licenses/LICENSE-2.0 -* <p> -* Unless required by applicable law or agreed to in writing,<br> -* software distributed under the License is distributed on an "AS IS" BASIS, <br> -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br> -* See the License for the specific language governing permissions and <br> -* limitations under the License. -* <p> -* Copyright (c) since 2004 at Multimedia- & E-Learning Services (MELS),<br> -* University of Zurich, Switzerland. -* <hr> -* <a href="http://www.openolat.org"> -* OpenOLAT - Online Learning and Training</a><br> -* This file has been modified by the OpenOLAT community. Changes are licensed -* under the Apache 2.0 license as the original file. -*/ - -package org.olat.search.service.document.file; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; - -import org.apache.poi.poifs.eventfilesystem.POIFSReader; -import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent; -import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener; -import org.apache.poi.poifs.filesystem.DocumentInputStream; -import org.apache.poi.util.LittleEndian; -import org.olat.core.logging.OLog; -import org.olat.core.logging.Tracing; - -/** - * @author Christian Guretzki - */ -public class PPT2Text { - - - public static void extractText(InputStream inStream, OutputStream stream ) throws IOException { - POIFSReader r = new POIFSReader(); - /* Register a listener for *all* documents. */ - r.registerListener(new MyPOIFSReaderListener(stream)); - r.read(inStream); - } - - static class MyPOIFSReaderListener implements POIFSReaderListener { - private static final OLog log = Tracing.createLoggerFor(PPT2Text.class); - - private final OutputStream oStream; - - public MyPOIFSReaderListener(OutputStream oStream) { - this.oStream = oStream; - } - - public void processPOIFSReaderEvent(POIFSReaderEvent event) { - int errorCounter = 0; - - try { - DocumentInputStream dis = null; - dis = event.getStream(); - - byte btoWrite[] = new byte[dis.available()]; - dis.read(btoWrite, 0, dis.available()); - for (int i = 0; i < btoWrite.length - 20; i++) { - long type = LittleEndian.getUShort(btoWrite, i + 2); - long size = LittleEndian.getUInt(btoWrite, i + 4); - if (type == 4008) { - try { - oStream.write(btoWrite, i + 4 + 1, (int) size + 3); - } catch( IndexOutOfBoundsException ex) { - errorCounter++; - } - } - } - } catch (Exception ex) { - // FIXME:chg: Remove general Exception later, for now make it run - log.warn("Can not read PPT content.", ex); - } - if (errorCounter > 0) { - if (log.isDebug()) log.debug("Could not parse ppt properly. There were " + errorCounter + " IndexOutOfBoundsException"); - } - } - } -} \ No newline at end of file diff --git a/src/main/java/org/olat/search/service/document/file/PdfDocument.java b/src/main/java/org/olat/search/service/document/file/PdfDocument.java index 65bc872df09..c82bbace807 100644 --- a/src/main/java/org/olat/search/service/document/file/PdfDocument.java +++ b/src/main/java/org/olat/search/service/document/file/PdfDocument.java @@ -26,7 +26,6 @@ package org.olat.search.service.document.file; import java.io.BufferedInputStream; -import java.io.BufferedOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; @@ -48,6 +47,7 @@ import org.olat.search.service.SearchServiceFactory; * @author Christian Guretzki */ public class PdfDocument extends FileDocument { + private static final long serialVersionUID = 6432923202585881794L; private static OLog log = Tracing.createLoggerFor(PdfDocument.class); public final static String FILE_TYPE = "type.file.pdf"; @@ -59,7 +59,6 @@ public class PdfDocument extends FileDocument { private String filePath; public PdfDocument() { - super(); pdfTextBuffering = SearchServiceFactory.getService().getSearchModuleConfig().getPdfTextBuffering(); pdfTextBufferPath = SearchServiceFactory.getService().getSearchModuleConfig().getPdfTextBufferPath(); } @@ -91,7 +90,7 @@ public class PdfDocument extends FileDocument { return pdfTextTmpFilePath; } - protected String readContent(VFSLeaf leaf) throws DocumentException,DocumentAccessException { + protected String readContent(VFSLeaf leaf) throws DocumentException, DocumentAccessException { try { long startTime = 0; if (log.isDebug()) startTime = System.currentTimeMillis(); @@ -117,7 +116,7 @@ public class PdfDocument extends FileDocument { return pdfText; } catch (DocumentAccessException ex) { // pass exception - throw new DocumentAccessException(ex.getMessage()); + throw ex; } catch (Exception ex) { throw new DocumentException("Can not read PDF content. File=" + leaf.getName() + ";" + ex.getMessage() ); } diff --git a/src/main/java/org/olat/search/service/document/file/PowerPointDocument.java b/src/main/java/org/olat/search/service/document/file/PowerPointDocument.java index 8a7327b7695..372ae8c2ab6 100644 --- a/src/main/java/org/olat/search/service/document/file/PowerPointDocument.java +++ b/src/main/java/org/olat/search/service/document/file/PowerPointDocument.java @@ -28,11 +28,17 @@ package org.olat.search.service.document.file; import java.io.BufferedInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.io.InputStream; import java.io.OutputStream; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.lucene.document.Document; +import org.apache.poi.poifs.eventfilesystem.POIFSReader; +import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent; +import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener; +import org.apache.poi.poifs.filesystem.DocumentInputStream; +import org.apache.poi.util.LittleEndian; import org.olat.core.logging.OLog; import org.olat.core.logging.Tracing; import org.olat.core.util.vfs.VFSLeaf; @@ -43,6 +49,7 @@ import org.olat.search.service.SearchResourceContext; * @author Christian Guretzki */ public class PowerPointDocument extends FileDocument { + private static final long serialVersionUID = -6107766953370631805L; private static final OLog log = Tracing.createLoggerFor(PowerPointDocument.class); public final static String FILE_TYPE = "type.file.ppt"; @@ -69,7 +76,7 @@ public class PowerPointDocument extends FileDocument { try { bis = new BufferedInputStream(leaf.getInputStream()); oStream = new ByteArrayOutputStream(); - PPT2Text.extractText(bis, oStream); + extractText(bis, oStream); String content = oStream.toString(); return removeUnvisibleChars(content); } catch (Exception e) { @@ -95,5 +102,49 @@ public class PowerPointDocument extends FileDocument { String output = m.replaceAll(" "); return output; } + + private void extractText(InputStream inStream, OutputStream stream ) throws IOException { + POIFSReader r = new POIFSReader(); + /* Register a listener for *all* documents. */ + r.registerListener(new MyPOIFSReaderListener(stream)); + r.read(inStream); + } + + private class MyPOIFSReaderListener implements POIFSReaderListener { + private final OutputStream oStream; + + public MyPOIFSReaderListener(OutputStream oStream) { + this.oStream = oStream; + } + + public void processPOIFSReaderEvent(POIFSReaderEvent event) { + int errorCounter = 0; + + try { + DocumentInputStream dis = null; + dis = event.getStream(); + + byte btoWrite[] = new byte[dis.available()]; + dis.read(btoWrite, 0, dis.available()); + for (int i = 0; i < btoWrite.length - 20; i++) { + long type = LittleEndian.getUShort(btoWrite, i + 2); + long size = LittleEndian.getUInt(btoWrite, i + 4); + if (type == 4008) { + try { + oStream.write(btoWrite, i + 4 + 1, (int) size + 3); + } catch( IndexOutOfBoundsException ex) { + errorCounter++; + } + } + } + } catch (Exception ex) { + // FIXME:chg: Remove general Exception later, for now make it run + log.warn("Can not read PPT content.", ex); + } + if (errorCounter > 0) { + if (log.isDebug()) log.debug("Could not parse ppt properly. There were " + errorCounter + " IndexOutOfBoundsException"); + } + } + } } diff --git a/src/main/java/org/olat/search/service/document/file/PowerPointOOXMLDocument.java b/src/main/java/org/olat/search/service/document/file/PowerPointOOXMLDocument.java index f1e69c1f7b9..89f6bef113c 100644 --- a/src/main/java/org/olat/search/service/document/file/PowerPointOOXMLDocument.java +++ b/src/main/java/org/olat/search/service/document/file/PowerPointOOXMLDocument.java @@ -61,12 +61,13 @@ import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry; * @author srosse, stephane.rosse@frentix.com */ public class PowerPointOOXMLDocument extends FileDocument { + private static final long serialVersionUID = 2322994231200065526L; private static final OLog log = Tracing.createLoggerFor(PowerPointOOXMLDocument.class); public final static String FILE_TYPE = "type.file.ppt"; public PowerPointOOXMLDocument() { - super(); + // } public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) throws IOException, DocumentException, diff --git a/src/main/java/org/olat/search/service/document/file/TextDocument.java b/src/main/java/org/olat/search/service/document/file/TextDocument.java index b96d8d2a8a7..df0f2eb0c19 100644 --- a/src/main/java/org/olat/search/service/document/file/TextDocument.java +++ b/src/main/java/org/olat/search/service/document/file/TextDocument.java @@ -40,13 +40,13 @@ import org.olat.search.service.SearchResourceContext; * @author Christian Guretzki */ public class TextDocument extends FileDocument { - + private static final long serialVersionUID = 9188038452431819507L; private static final OLog log = Tracing.createLoggerFor(TextDocument.class); public final static String FILE_TYPE = "type.file.text"; public TextDocument() { - super(); + // } public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) throws IOException,DocumentException,DocumentAccessException { diff --git a/src/main/java/org/olat/search/service/document/file/UnkownDocument.java b/src/main/java/org/olat/search/service/document/file/UnkownDocument.java index 0fe753d6d91..9a0d48db2b1 100644 --- a/src/main/java/org/olat/search/service/document/file/UnkownDocument.java +++ b/src/main/java/org/olat/search/service/document/file/UnkownDocument.java @@ -34,7 +34,7 @@ import org.olat.core.util.vfs.VFSLeaf; import org.olat.search.service.SearchResourceContext; public class UnkownDocument extends FileDocument { - + private static final long serialVersionUID = 7032285703715695914L; private final static OLog log = Tracing.createLoggerFor(UnkownDocument.class); public final static String UNKOWN_TYPE = "type.file.unkown"; diff --git a/src/main/java/org/olat/search/service/document/file/WordDocument.java b/src/main/java/org/olat/search/service/document/file/WordDocument.java index 282320dcdf0..4df94096b34 100644 --- a/src/main/java/org/olat/search/service/document/file/WordDocument.java +++ b/src/main/java/org/olat/search/service/document/file/WordDocument.java @@ -44,12 +44,13 @@ import org.olat.search.service.SearchResourceContext; * @author Christian Guretzki */ public class WordDocument extends FileDocument { + private static final long serialVersionUID = 1827194935338994490L; private static final OLog log = Tracing.createLoggerFor(WordDocument.class); public final static String FILE_TYPE = "type.file.word"; public WordDocument() { - super(); + // } public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) throws IOException,DocumentException,DocumentAccessException { diff --git a/src/main/java/org/olat/search/service/document/file/WordOOXMLDocument.java b/src/main/java/org/olat/search/service/document/file/WordOOXMLDocument.java index b176ba11163..3c82faac8f7 100644 --- a/src/main/java/org/olat/search/service/document/file/WordOOXMLDocument.java +++ b/src/main/java/org/olat/search/service/document/file/WordOOXMLDocument.java @@ -52,12 +52,13 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr; * @author srosse, stephane.rosse@frentix.com */ public class WordOOXMLDocument extends FileDocument { + private static final long serialVersionUID = 3684533132759600322L; private static final OLog log = Tracing.createLoggerFor(WordOOXMLDocument.class); public final static String FILE_TYPE = "type.file.word"; public WordOOXMLDocument() { - super(); + // } public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) throws IOException,DocumentException,DocumentAccessException { diff --git a/src/main/java/org/olat/search/service/document/file/XmlDocument.java b/src/main/java/org/olat/search/service/document/file/XmlDocument.java index 6d604d57055..3d3a4f4670b 100644 --- a/src/main/java/org/olat/search/service/document/file/XmlDocument.java +++ b/src/main/java/org/olat/search/service/document/file/XmlDocument.java @@ -41,12 +41,13 @@ import org.olat.search.service.SearchResourceContext; * @author Christian Guretzki */ public class XmlDocument extends FileDocument { + private static final long serialVersionUID = -5486191227086694167L; private static final OLog log = Tracing.createLoggerFor(XmlDocument.class); public static final String FILE_TYPE = "type.file.html"; public XmlDocument() { - super(); + // } public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) throws IOException,DocumentException,DocumentAccessException { @@ -62,9 +63,15 @@ public class XmlDocument extends FileDocument { protected String readContent(VFSLeaf leaf) throws IOException { InputStream is = leaf.getInputStream(); // Remove all HTML and Tags - String output = new NekoHTMLFilter().filter(is); - if (log.isDebug() ) log.debug("HTML content without tags :" + output); - FileUtils.closeSafely(is); + String output; + try { + output = new NekoHTMLFilter().filter(is); + if (log.isDebug() ) log.debug("HTML content without tags :" + output); + } catch (Exception e) { + throw new IOException(e); + } finally { + FileUtils.closeSafely(is); + } return output; } diff --git a/src/main/java/org/olat/search/service/indexer/FolderIndexer.java b/src/main/java/org/olat/search/service/indexer/FolderIndexer.java index 3c4b50febc4..e8f6ebafc37 100644 --- a/src/main/java/org/olat/search/service/indexer/FolderIndexer.java +++ b/src/main/java/org/olat/search/service/indexer/FolderIndexer.java @@ -29,16 +29,14 @@ package org.olat.search.service.indexer; import java.io.IOException; import org.apache.lucene.document.Document; +import org.olat.core.CoreSpringFactory; import org.olat.core.commons.persistence.DBFactory; import org.olat.core.util.WorkThreadInformations; import org.olat.core.util.vfs.VFSContainer; import org.olat.core.util.vfs.VFSItem; import org.olat.core.util.vfs.VFSLeaf; import org.olat.search.service.SearchResourceContext; -import org.olat.search.service.SearchServiceFactory; import org.olat.search.service.document.file.DocumentAccessException; -import org.olat.search.service.document.file.DocumentException; -import org.olat.search.service.document.file.DocumentNotImplementedException; import org.olat.search.service.document.file.FileDocumentFactory; /** @@ -95,7 +93,7 @@ public abstract class FolderIndexer extends AbstractHierarchicalIndexer { protected void doIndexVFSLeafByMySelf(SearchResourceContext leafResourceContext, VFSLeaf leaf, OlatFullIndexer indexWriter, String filePath) throws InterruptedException { if (isLogDebugEnabled()) logDebug("Analyse VFSLeaf=" + leaf.getName()); try { - if (SearchServiceFactory.getFileDocumentFactory().isFileSupported(leaf)) { + if (CoreSpringFactory.getImpl(FileDocumentFactory.class).isFileSupported(leaf)) { String myFilePath = ""; if (filePath.endsWith("/")) { myFilePath = filePath + leaf.getName(); @@ -105,17 +103,13 @@ public abstract class FolderIndexer extends AbstractHierarchicalIndexer { leafResourceContext.setFilePath(myFilePath); //fxdiff FXOLAT-97: high CPU load tracker WorkThreadInformations.set("Index VFSLeaf=" + myFilePath + " at " + leafResourceContext.getResourceUrl()); - Document document = FileDocumentFactory.createDocument(leafResourceContext, leaf); + Document document = CoreSpringFactory.getImpl(FileDocumentFactory.class).createDocument(leafResourceContext, leaf); indexWriter.addDocument(document); } else { if (isLogDebugEnabled()) logDebug("Documenttype not supported. file=" + leaf.getName()); } } catch (DocumentAccessException e) { if (isLogDebugEnabled()) logDebug("Can not access document." + e.getMessage()); - } catch (DocumentNotImplementedException e) { - if (isLogDebugEnabled()) logDebug("Documenttype not implemented." + e.getMessage()); - } catch (DocumentException dex) { - if (isLogDebugEnabled()) logDebug("DocumentException: Can not index leaf=" + leaf.getName() + " exception=" + dex.getMessage()); } catch (IOException ioEx) { logWarn("IOException: Can not index leaf=" + leaf.getName(), ioEx); } catch (InterruptedException iex) { diff --git a/src/main/java/org/olat/search/service/indexer/FolderIndexerWorker.java b/src/main/java/org/olat/search/service/indexer/FolderIndexerWorker.java index d82718657e1..1ef753ef45f 100644 --- a/src/main/java/org/olat/search/service/indexer/FolderIndexerWorker.java +++ b/src/main/java/org/olat/search/service/indexer/FolderIndexerWorker.java @@ -29,6 +29,7 @@ package org.olat.search.service.indexer; import java.io.IOException; import org.apache.lucene.document.Document; +import org.olat.core.CoreSpringFactory; import org.olat.core.commons.persistence.DBFactory; import org.olat.core.logging.OLog; import org.olat.core.logging.Tracing; @@ -37,10 +38,7 @@ import org.olat.core.util.vfs.VFSContainer; import org.olat.core.util.vfs.VFSItem; import org.olat.core.util.vfs.VFSLeaf; import org.olat.search.service.SearchResourceContext; -import org.olat.search.service.SearchServiceFactory; import org.olat.search.service.document.file.DocumentAccessException; -import org.olat.search.service.document.file.DocumentException; -import org.olat.search.service.document.file.DocumentNotImplementedException; import org.olat.search.service.document.file.FileDocumentFactory; /** @@ -125,13 +123,13 @@ public class FolderIndexerWorker implements Runnable{ protected void doIndexVFSLeaf(SearchResourceContext leafResourceContext, VFSLeaf leaf, OlatFullIndexer writer, String fPath) { if (log.isDebug()) log.debug("Analyse VFSLeaf=" + leaf.getName()); try { - if (SearchServiceFactory.getFileDocumentFactory().isFileSupported(leaf)) { + if (CoreSpringFactory.getImpl(FileDocumentFactory.class).isFileSupported(leaf)) { String myFilePath = fPath + "/" + leaf.getName(); leafResourceContext.setFilePath(myFilePath); //fxdiff FXOLAT-97: high CPU load tracker WorkThreadInformations.setInfoFiles(myFilePath, leaf); WorkThreadInformations.set("Index VFSLeaf=" + myFilePath + " at " + leafResourceContext.getResourceUrl()); - Document document = FileDocumentFactory.createDocument(leafResourceContext, leaf); + Document document = CoreSpringFactory.getImpl(FileDocumentFactory.class).createDocument(leafResourceContext, leaf); if(document != null) {//document wihich are disabled return null writer.addDocument(document); } @@ -140,12 +138,8 @@ public class FolderIndexerWorker implements Runnable{ } } catch (DocumentAccessException e) { if (log.isDebug()) log.debug("Can not access document." + e.getMessage()); - } catch (DocumentNotImplementedException e) { - if (log.isDebug()) log.debug("Documenttype not implemented." + e.getMessage()); } catch (InterruptedException e) { if (log.isDebug()) log.debug("InterruptedException: Can not index leaf=" + leaf.getName() + ";" + e.getMessage()); - }catch (DocumentException dex) { - log.debug("DocumentException: Can not index leaf=" + leaf.getName() + " , exception=" + dex); } catch (IOException ioEx) { log.warn("IOException: Can not index leaf=" + leaf.getName(), ioEx); } catch (Exception ex) { diff --git a/src/main/java/org/olat/search/service/indexer/FullIndexerStatus.java b/src/main/java/org/olat/search/service/indexer/FullIndexerStatus.java index a079c8447da..6381d19687f 100644 --- a/src/main/java/org/olat/search/service/indexer/FullIndexerStatus.java +++ b/src/main/java/org/olat/search/service/indexer/FullIndexerStatus.java @@ -29,7 +29,8 @@ import java.util.Date; import java.util.Hashtable; import java.util.Map; -import org.olat.search.service.SearchServiceFactory; +import org.olat.core.CoreSpringFactory; +import org.olat.search.service.document.file.FileDocumentFactory; /** * Status of full indexer. @@ -130,7 +131,7 @@ public class FullIndexerStatus { } documentCounters = new Hashtable<String,Integer>(); fileTypeCounters = new Hashtable<String,Integer>(); - SearchServiceFactory.getFileDocumentFactory().resetExcludedFileSizeCount(); + CoreSpringFactory.getImpl(FileDocumentFactory.class).resetExcludedFileSizeCount(); } /** @@ -279,7 +280,7 @@ public class FullIndexerStatus { } public int getExcludedDocumentCount() { - return SearchServiceFactory.getFileDocumentFactory().getExcludedFileSizeCount(); + return CoreSpringFactory.getImpl(FileDocumentFactory.class).getExcludedFileSizeCount(); } } diff --git a/src/main/java/org/olat/search/service/indexer/repository/course/DialogCourseNodeIndexer.java b/src/main/java/org/olat/search/service/indexer/repository/course/DialogCourseNodeIndexer.java index e9236a71aab..a960d2b85a3 100644 --- a/src/main/java/org/olat/search/service/indexer/repository/course/DialogCourseNodeIndexer.java +++ b/src/main/java/org/olat/search/service/indexer/repository/course/DialogCourseNodeIndexer.java @@ -33,6 +33,7 @@ import java.util.List; import org.apache.lucene.document.Document; import org.olat.basesecurity.BaseSecurityManager; import org.olat.basesecurity.Constants; +import org.olat.core.CoreSpringFactory; import org.olat.core.commons.modules.bc.vfs.OlatRootFolderImpl; import org.olat.core.id.Identity; import org.olat.core.id.OLATResourceable; @@ -55,11 +56,8 @@ import org.olat.modules.fo.ForumManager; import org.olat.modules.fo.Message; import org.olat.modules.fo.Status; import org.olat.search.service.SearchResourceContext; -import org.olat.search.service.SearchServiceFactory; import org.olat.search.service.document.ForumMessageDocument; import org.olat.search.service.document.file.DocumentAccessException; -import org.olat.search.service.document.file.DocumentException; -import org.olat.search.service.document.file.DocumentNotImplementedException; import org.olat.search.service.document.file.FileDocumentFactory; import org.olat.search.service.indexer.DefaultIndexer; import org.olat.search.service.indexer.OlatFullIndexer; @@ -120,22 +118,18 @@ public class DialogCourseNodeIndexer extends DefaultIndexer implements CourseNod VFSLeaf leaf = (VFSLeaf) forumContainer.getItems(new VFSLeafFilter()).get(0); if (isLogDebugEnabled()) logDebug("Analyse VFSLeaf=" + leaf.getName()); try { - if (SearchServiceFactory.getFileDocumentFactory().isFileSupported(leaf)) { + if (CoreSpringFactory.getImpl(FileDocumentFactory.class).isFileSupported(leaf)) { leafResourceContext.setFilePath(filename); leafResourceContext.setDocumentType(TYPE_FILE); //fxdiff FXOLAT-97: high CPU load tracker WorkThreadInformations.set("Index Dialog VFSLeaf=" + filename + " at " + leafResourceContext.getResourceUrl()); - Document document = FileDocumentFactory.createDocument(leafResourceContext, leaf); + Document document = CoreSpringFactory.getImpl(FileDocumentFactory.class).createDocument(leafResourceContext, leaf); indexWriter.addDocument(document); } else { if (isLogDebugEnabled()) logDebug("Documenttype not supported. file=" + leaf.getName()); } } catch (DocumentAccessException e) { if (isLogDebugEnabled()) logDebug("Can not access document." + e.getMessage()); - } catch (DocumentNotImplementedException e) { - if (isLogDebugEnabled()) logDebug("Documenttype not implemented."); - } catch (DocumentException dex) { - if (isLogDebugEnabled()) logDebug("DocumentException: Can not index leaf=" + leaf.getName()); } catch (IOException ioEx) { logWarn("IOException: Can not index leaf=" + leaf.getName(), ioEx); } catch (InterruptedException iex) { diff --git a/src/test/java/org/olat/search/service/document/file/FileDocumentFactoryTest.java b/src/test/java/org/olat/search/service/document/file/FileDocumentFactoryTest.java index 0aec7814303..9a713c038a1 100644 --- a/src/test/java/org/olat/search/service/document/file/FileDocumentFactoryTest.java +++ b/src/test/java/org/olat/search/service/document/file/FileDocumentFactoryTest.java @@ -50,8 +50,8 @@ import org.olat.core.util.resource.OresHelper; import org.olat.core.util.vfs.LocalFileImpl; import org.olat.core.util.vfs.VFSLeaf; import org.olat.search.service.SearchResourceContext; -import org.olat.search.service.SearchServiceFactory; import org.olat.test.OlatTestCase; +import org.springframework.beans.factory.annotation.Autowired; /** * Lucene document mapper. @@ -61,7 +61,9 @@ public class FileDocumentFactoryTest extends OlatTestCase { private static Logger log = Logger.getLogger(FileDocumentFactoryTest.class.getName()); // variables for test fixture - FileDocumentFactory fileDocumentFactory; + + @Autowired + private FileDocumentFactory fileDocumentFactory; private String rootPath; /** @@ -70,7 +72,6 @@ public class FileDocumentFactoryTest extends OlatTestCase { @Before public void setup()throws Exception { //clear database from errors rootPath = "/search_junit_test_folder"; - fileDocumentFactory = SearchServiceFactory.getFileDocumentFactory(); } /** @@ -143,7 +144,7 @@ public class FileDocumentFactoryTest extends OlatTestCase { SearchResourceContext resourceContext = new SearchResourceContext(); resourceContext.setBusinessControlFor(OresHelper.createOLATResourceableType("FileDocumentFactoryTest")); resourceContext.setFilePath(filePath + "/" + leaf.getName()); - Document htmlDocument = FileDocumentFactory.createDocument(resourceContext, leaf); + Document htmlDocument = fileDocumentFactory.createDocument(resourceContext, leaf); // 1. Check content String content = htmlDocument.get(OlatDocument.CONTENT_FIELD_NAME); assertEquals("Wrong HTML content=" + content.trim() + " , must be =" + text.trim(), text.trim(), content.trim()); @@ -154,12 +155,8 @@ public class FileDocumentFactoryTest extends OlatTestCase { String fileType = htmlDocument.get(OlatDocument.FILETYPE_FIELD_NAME); assertEquals("Wrong file-type", "type.file.html", fileType); - } catch (DocumentNotImplementedException e) { - fail("DocumentNotImplementedException=" + e.getMessage()); } catch (IOException e) { fail("IOException=" + e.getMessage()); - } catch (DocumentException e) { - fail("DocumentException=" + e.getMessage()); } catch (DocumentAccessException e) { fail("DocumentAccessException=" + e.getMessage()); } -- GitLab