From cd94da695d09f5722c3de15361e1b3c2dbad5f24 Mon Sep 17 00:00:00 2001
From: srosse <none@none>
Date: Wed, 24 Feb 2016 09:20:37 +0100
Subject: [PATCH] OO-1909: implement a fallback for old word format

---
 .../service/document/file/WordDocument.java   | 31 +++++++++++++------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/src/main/java/org/olat/search/service/document/file/WordDocument.java b/src/main/java/org/olat/search/service/document/file/WordDocument.java
index 8267b752b6b..23789be3b87 100644
--- a/src/main/java/org/olat/search/service/document/file/WordDocument.java
+++ b/src/main/java/org/olat/search/service/document/file/WordDocument.java
@@ -27,10 +27,14 @@ package org.olat.search.service.document.file;
 
 import java.io.BufferedInputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.Writer;
 import java.util.Iterator;
 
 import org.apache.lucene.document.Document;
+import org.apache.poi.hwpf.HWPFOldDocument;
+import org.apache.poi.hwpf.OldWordFileFormatException;
+import org.apache.poi.hwpf.extractor.Word6Extractor;
 import org.apache.poi.hwpf.extractor.WordExtractor;
 import org.apache.poi.poifs.filesystem.DocumentEntry;
 import org.apache.poi.poifs.filesystem.Entry;
@@ -71,10 +75,8 @@ public class WordDocument extends FileDocument {
 
 	@Override
 	protected FileContent readContent(VFSLeaf leaf) throws IOException, DocumentException {
-		BufferedInputStream bis = null;
 		LimitedContentWriter sb = new LimitedContentWriter((int)leaf.getSize(), FileDocumentFactory.getMaxFileSize());
-		try {
-			bis = new BufferedInputStream(leaf.getInputStream());
+		try(InputStream bis = new BufferedInputStream(leaf.getInputStream())) {
 			POIFSFileSystem filesystem = new POIFSFileSystem(bis);
 			Iterator<?> entries = filesystem.getRoot().getEntries();
 			while (entries.hasNext()) {
@@ -83,7 +85,7 @@ public class WordDocument extends FileDocument {
 				if (!(entry instanceof DocumentEntry)) {
 					// Skip directory entries
 				} else if ("WordDocument".equals(name)) {
-					collectWordDocument(filesystem, sb);
+					collectWordDocument(leaf, filesystem, sb);
 				}
 			}
 			return new FileContent(sb.toString());
@@ -91,18 +93,27 @@ public class WordDocument extends FileDocument {
 			log.warn("could not read in word document: " + leaf
 					+ " please check, that this is not an docx/rtf/html file!");
 			throw new DocumentException(e.getMessage());
-		} finally {
-			if (bis != null) {
-				bis.close();
-			}
 		}
 	}
 
-	private void collectWordDocument(POIFSFileSystem filesystem, Writer sb) throws IOException {
+	private void collectWordDocument(VFSLeaf leaf, POIFSFileSystem filesystem, Writer sb) throws IOException {
 		try(WordExtractor extractor = new WordExtractor(filesystem)) {
 			addTextIfAny(sb, extractor.getTextFromPieces());
+		} catch(OldWordFileFormatException ex) {
+			collectOldWordDocument(leaf, sb);
+		} catch(Exception e) {
+			log.error("Cannot read word document: " + leaf, e);
+		}
+	}
+
+	private void collectOldWordDocument(VFSLeaf leaf, Writer sb) throws IOException {
+		try(InputStream bis = new BufferedInputStream(leaf.getInputStream())) {
+			POIFSFileSystem pfs = new POIFSFileSystem(bis);
+			HWPFOldDocument doc = new HWPFOldDocument(pfs);
+			Word6Extractor docExtractor = new Word6Extractor(doc);
+			addTextIfAny(sb, docExtractor.getText());
 		} catch(Exception e) {
-			log.error("", e);
+			log.error("Cannot read old word document: " + leaf, e);
 		}
 	}
 
-- 
GitLab