From 3071feec786afec8a297077c6d5c7c5755b5a3ea Mon Sep 17 00:00:00 2001 From: srosse <none@none> Date: Wed, 22 Apr 2015 14:29:08 +0200 Subject: [PATCH] OO-1522: limit the maximum size of the indexed content by document, use the maxFileSize as variable --- .../core/util/filter/impl/NekoHTMLFilter.java | 32 ++-- .../core/util/io/LimitedContentWriter.java | 110 ++++++++++++++ .../olat/core/util/io/ShieldInputStream.java | 5 +- .../org/olat/core/util/vfs/JavaIOItem.java | 34 +++++ .../org/olat/core/util/vfs/LocalImpl.java | 2 +- .../org/olat/search/model/OlatDocument.java | 53 ++++--- .../AbstractOfficeDocumentComparator.java | 72 +++++++++ .../service/document/file/ExcelDocument.java | 10 +- .../document/file/ExcelOOXMLDocument.java | 63 +++----- .../service/document/file/FileContent.java | 51 +++++++ .../service/document/file/FileDocument.java | 21 --- .../document/file/FileDocumentFactory.java | 4 + .../service/document/file/HtmlDocument.java | 28 ++-- .../service/document/file/OpenDocument.java | 18 +-- .../service/document/file/PdfDocument.java | 47 +++--- .../document/file/PowerPointDocument.java | 129 ++++++++-------- .../file/PowerPointOOXMLDocument.java | 92 ++++++++---- .../service/document/file/TextDocument.java | 99 ++++++------ .../service/document/file/WordDocument.java | 141 +++++++++--------- .../document/file/WordOOXMLDocument.java | 127 +++++++++------- .../document/file/pdf/PdfBoxExtractor.java | 31 ++-- .../document/file/utils/SlicedDocument.java | 87 ----------- .../file/FileDocumentFactoryTest.java | 7 +- .../document/file/OfficeDocumentTest.java | 55 ++++++- .../document/file/PDFDocumentTest.java | 17 ++- .../java/org/olat/test/VFSJavaIOFile.java | 8 +- 26 files changed, 815 insertions(+), 528 deletions(-) create mode 100644 src/main/java/org/olat/core/util/io/LimitedContentWriter.java create mode 100644 src/main/java/org/olat/core/util/vfs/JavaIOItem.java create mode 100644 src/main/java/org/olat/search/service/document/file/AbstractOfficeDocumentComparator.java create mode 100644 src/main/java/org/olat/search/service/document/file/FileContent.java delete mode 100644 src/main/java/org/olat/search/service/document/file/utils/SlicedDocument.java diff --git a/src/main/java/org/olat/core/util/filter/impl/NekoHTMLFilter.java b/src/main/java/org/olat/core/util/filter/impl/NekoHTMLFilter.java index bead7cb6601..3370d64a250 100644 --- a/src/main/java/org/olat/core/util/filter/impl/NekoHTMLFilter.java +++ b/src/main/java/org/olat/core/util/filter/impl/NekoHTMLFilter.java @@ -29,6 +29,8 @@ import java.util.Set; import org.cyberneko.html.parsers.SAXParser; import org.olat.core.logging.LogDelegator; import org.olat.core.util.filter.Filter; +import org.olat.core.util.io.LimitedContentWriter; +import org.olat.search.service.document.file.FileDocumentFactory; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; @@ -97,9 +99,9 @@ public class NekoHTMLFilter extends LogDelegator implements Filter { public static class NekoContent { private final String title; - private final String content; + private final LimitedContentWriter content; - public NekoContent(String title, String content) { + public NekoContent(String title, LimitedContentWriter content) { this.title = title; this.content = content; } @@ -109,7 +111,7 @@ public class NekoHTMLFilter extends LogDelegator implements Filter { } public String getContent() { - return content; + return content.toString(); } } @@ -118,12 +120,12 @@ public class NekoHTMLFilter extends LogDelegator implements Filter { private boolean consumeBlanck = false; private boolean consumeTitle = true; private final boolean pretty; - private final StringBuilder sb; + private final LimitedContentWriter content; private final StringBuilder title; public HTMLHandler(int size, boolean pretty) { this.pretty = pretty; - sb = new StringBuilder(size); + content = new LimitedContentWriter(size, FileDocumentFactory.getMaxFileSize()); title = new StringBuilder(32); } @@ -136,15 +138,15 @@ public class NekoHTMLFilter extends LogDelegator implements Filter { } else { if(pretty) { if("li".equals(elem)) { - sb.append("\u00B7 "); + content.append("\u00B7 "); } else if("br".equals(elem)) { - sb.append('\n'); + content.append('\n'); } } if("title".equals(elem)) { consumeTitle = true; } - if(blockTags.contains(elem) && sb.length() > 0 && sb.charAt(sb.length() -1) != ' ' ) { + if(blockTags.contains(elem) && content.length() > 0 && content.charAt(content.length() -1) != ' ' ) { consumeBlanck = true; } } @@ -154,12 +156,12 @@ public class NekoHTMLFilter extends LogDelegator implements Filter { public void characters(char[] chars, int offset, int length) { if(collect) { if(consumeBlanck) { - if(sb.length() > 0 && sb.charAt(sb.length() -1) != ' ' && length > 0 && chars[offset] != ' ') { - sb.append(' '); + if(content.length() > 0 && content.charAt(content.length() -1) != ' ' && length > 0 && chars[offset] != ' ') { + content.append(' '); } consumeBlanck = false; } - sb.append(chars, offset, length); + content.write(chars, offset, length); if(consumeTitle) { title.append(chars, offset, length); } @@ -173,24 +175,24 @@ public class NekoHTMLFilter extends LogDelegator implements Filter { collect = true; } else { if(pretty && ("li".equals(elem) || "p".equals(elem))) { - sb.append('\n'); + content.append('\n'); } if("title".equals(elem)) { consumeTitle = false; } - if(blockTags.contains(elem) && sb.length() > 0 && sb.charAt(sb.length() -1) != ' ' ) { + if(blockTags.contains(elem) && content.length() > 0 && content.charAt(content.length() -1) != ' ' ) { consumeBlanck = true; } } } public NekoContent getContent() { - return new NekoContent(title.toString(), sb.toString()); + return new NekoContent(title.toString(), content); } @Override public String toString() { - return sb.toString(); + return content.toString(); } } } diff --git a/src/main/java/org/olat/core/util/io/LimitedContentWriter.java b/src/main/java/org/olat/core/util/io/LimitedContentWriter.java new file mode 100644 index 00000000000..b81fed8387e --- /dev/null +++ b/src/main/java/org/olat/core/util/io/LimitedContentWriter.java @@ -0,0 +1,110 @@ +/** + * <a href="http://www.openolat.org"> + * OpenOLAT - Online Learning and Training</a><br> + * <p> + * Licensed under the Apache License, Version 2.0 (the "License"); <br> + * you may not use this file except in compliance with the License.<br> + * You may obtain a copy of the License at the + * <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache homepage</a> + * <p> + * Unless required by applicable law or agreed to in writing,<br> + * software distributed under the License is distributed on an "AS IS" BASIS, <br> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br> + * See the License for the specific language governing permissions and <br> + * limitations under the License. + * <p> + * Initial code contributed and copyrighted by<br> + * frentix GmbH, http://www.frentix.com + * <p> + */ +package org.olat.core.util.io; + +import java.io.Writer; + +/** + * + * Initial date: 21.04.2015<br> + * @author srosse, stephane.rosse@frentix.com, http://www.frentix.com + * + */ +public class LimitedContentWriter extends Writer { + + private final StringBuilder sb; + private final int maxSize; + + /** + * @param len + */ + public LimitedContentWriter(int len, int maxSize) { + sb = new StringBuilder(Math.min(len, maxSize)); + this.maxSize = maxSize; + } + + protected LimitedContentWriter(int len) { + this(len, Integer.MAX_VALUE); + } + + @Override + public void write(char[] cbuf, int off, int len) { + if(accept() && len + sb.length() < maxSize) { + sb.append(cbuf, off, len); + } + } + + @Override + public Writer append(CharSequence seq, int start, int end) { + if(accept()) { + if((end - start) + sb.length() < maxSize) { + sb.append(seq, start, end); + } else { + sb.append(seq, start, start + (maxSize - sb.length())); + } + } + return this; + } + + @Override + public Writer append(CharSequence csq) { + if(accept()) { + if(csq.length() + sb.length() < maxSize) { + sb.append(csq); + } else { + sb.append(csq, 0, maxSize - sb.length()); + } + } + return this; + } + + @Override + public Writer append(char c) { + if(accept()) sb.append(c); + return this; + } + + public final boolean accept() { + return sb.length() < maxSize; + } + + @Override + public void flush() { + // + } + + @Override + public void close() { + // + } + + public int length() { + return sb.length(); + } + + public char charAt(int index) { + return sb.charAt(index); + } + + @Override + public String toString() { + return sb.toString(); + } +} diff --git a/src/main/java/org/olat/core/util/io/ShieldInputStream.java b/src/main/java/org/olat/core/util/io/ShieldInputStream.java index 9b9914bd8ed..117b9d3f279 100644 --- a/src/main/java/org/olat/core/util/io/ShieldInputStream.java +++ b/src/main/java/org/olat/core/util/io/ShieldInputStream.java @@ -21,7 +21,6 @@ package org.olat.core.util.io; import java.io.IOException; import java.io.InputStream; -import java.util.zip.ZipInputStream; /** * It's a wrapper for a ZIP input stream which MUST not be closed @@ -33,9 +32,9 @@ import java.util.zip.ZipInputStream; * @author srosse, stephane.rosse@frentix.com, http://www.frentix */ public class ShieldInputStream extends InputStream { - private final ZipInputStream delegate; + private final InputStream delegate; - public ShieldInputStream(ZipInputStream delegate) { + public ShieldInputStream(InputStream delegate) { this.delegate = delegate; } diff --git a/src/main/java/org/olat/core/util/vfs/JavaIOItem.java b/src/main/java/org/olat/core/util/vfs/JavaIOItem.java new file mode 100644 index 00000000000..9aa1223d9e0 --- /dev/null +++ b/src/main/java/org/olat/core/util/vfs/JavaIOItem.java @@ -0,0 +1,34 @@ +/** + * <a href="http://www.openolat.org"> + * OpenOLAT - Online Learning and Training</a><br> + * <p> + * Licensed under the Apache License, Version 2.0 (the "License"); <br> + * you may not use this file except in compliance with the License.<br> + * You may obtain a copy of the License at the + * <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache homepage</a> + * <p> + * Unless required by applicable law or agreed to in writing,<br> + * software distributed under the License is distributed on an "AS IS" BASIS, <br> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br> + * See the License for the specific language governing permissions and <br> + * limitations under the License. + * <p> + * Initial code contributed and copyrighted by<br> + * frentix GmbH, http://www.frentix.com + * <p> + */ +package org.olat.core.util.vfs; + +import java.io.File; + +/** + * + * Initial date: 22.04.2015<br> + * @author srosse, stephane.rosse@frentix.com, http://www.frentix.com + * + */ +public interface JavaIOItem { + + public File getBasefile(); + +} diff --git a/src/main/java/org/olat/core/util/vfs/LocalImpl.java b/src/main/java/org/olat/core/util/vfs/LocalImpl.java index 1ceab961103..48ad07e9433 100644 --- a/src/main/java/org/olat/core/util/vfs/LocalImpl.java +++ b/src/main/java/org/olat/core/util/vfs/LocalImpl.java @@ -39,7 +39,7 @@ import org.olat.core.util.vfs.callbacks.VFSSecurityCallback; * * @author Felix Jost */ -public abstract class LocalImpl implements VFSItem { +public abstract class LocalImpl implements VFSItem, JavaIOItem { private File basefile; private VFSContainer parentContainer; diff --git a/src/main/java/org/olat/search/model/OlatDocument.java b/src/main/java/org/olat/search/model/OlatDocument.java index 7a2472f421c..ee790ad0c1b 100644 --- a/src/main/java/org/olat/search/model/OlatDocument.java +++ b/src/main/java/org/olat/search/model/OlatDocument.java @@ -25,6 +25,7 @@ package org.olat.search.model; +import java.util.Date; import java.util.List; import java.util.Map.Entry; import java.util.StringTokenizer; @@ -85,7 +86,7 @@ public class OlatDocument extends AbstractOlatDocument { } document.add(createTextField(TITLE_FIELD_NAME,getTitle(), 4)); document.add(createTextField(DESCRIPTION_FIELD_NAME,getDescription(), 2)); - document.add(createTextField(CONTENT_FIELD_NAME,getContent(), 0.5f ) ); + document.add(createTextField(CONTENT_FIELD_NAME, getContent(), 0.5f)); document.add(new StringField(RESOURCEURL_FIELD_NAME, getResourceUrl(), Field.Store.YES)); document.add(new StringField(DOCUMENTTYPE_FIELD_NAME,getDocumentType(), Field.Store.YES)); if(getCssIcon() != null) { @@ -95,26 +96,26 @@ public class OlatDocument extends AbstractOlatDocument { document.add(createTextField(AUTHOR_FIELD_NAME,getAuthor(), 2)); try { - if(getCreatedDate() != null) { - document.add(new StringField(CREATED_FIELD_NAME, DateTools.dateToString(getCreatedDate(), DateTools.Resolution.DAY), Field.Store.YES) ); - } - }catch (Exception ex) { - // No createdDate set => does not add field - } - try { - if(getLastChange() != null) { - document.add(new StringField(CHANGED_FIELD_NAME, DateTools.dateToString(getLastChange(), DateTools.Resolution.DAY), Field.Store.YES) ); - } - }catch (Exception ex) { - // No changedDate set => does not add field - } - try { - if(getTimestamp() != null) { - document.add(new StringField(TIME_STAMP_NAME, DateTools.dateToString(getTimestamp(), DateTools.Resolution.MILLISECOND), Field.Store.YES) ); - } - }catch (Exception ex) { - // No changedDate set => does not add field - } + if(getCreatedDate() != null) { + document.add(createDayField(CREATED_FIELD_NAME, getCreatedDate())); + } + } catch (Exception ex) { + // No createdDate set => does not add field + } + try { + if (getLastChange() != null) { + document.add(createDayField(CHANGED_FIELD_NAME, getLastChange())); + } + } catch (Exception ex) { + // No changedDate set => does not add field + } + try { + if (getTimestamp() != null) { + document.add(createMillisecondField(TIME_STAMP_NAME, getTimestamp())); + } + } catch (Exception ex) { + // No changedDate set => does not add field + } // Add various metadata if (metadata != null) { @@ -136,7 +137,7 @@ public class OlatDocument extends AbstractOlatDocument { } else { document.add(new StringField(RESERVED_TO, "public", Field.Store.YES)); } - return document; + return document; } /** @@ -151,4 +152,12 @@ public class OlatDocument extends AbstractOlatDocument { field.setBoost(boost); return field; } + + protected static Field createDayField(String fieldName, Date date) { + return new StringField(fieldName, DateTools.dateToString(date, DateTools.Resolution.DAY), Field.Store.YES); + } + + protected static Field createMillisecondField(String fieldName, Date date) { + return new StringField(fieldName, DateTools.dateToString(date, DateTools.Resolution.MILLISECOND), Field.Store.YES); + } } diff --git a/src/main/java/org/olat/search/service/document/file/AbstractOfficeDocumentComparator.java b/src/main/java/org/olat/search/service/document/file/AbstractOfficeDocumentComparator.java new file mode 100644 index 00000000000..08e7dc53fdd --- /dev/null +++ b/src/main/java/org/olat/search/service/document/file/AbstractOfficeDocumentComparator.java @@ -0,0 +1,72 @@ +/** + * <a href="http://www.openolat.org"> + * OpenOLAT - Online Learning and Training</a><br> + * <p> + * Licensed under the Apache License, Version 2.0 (the "License"); <br> + * you may not use this file except in compliance with the License.<br> + * You may obtain a copy of the License at the + * <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache homepage</a> + * <p> + * Unless required by applicable law or agreed to in writing,<br> + * software distributed under the License is distributed on an "AS IS" BASIS, <br> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br> + * See the License for the specific language governing permissions and <br> + * limitations under the License. + * <p> + * Initial code contributed and copyrighted by<br> + * frentix GmbH, http://www.frentix.com + * <p> + */ +package org.olat.search.service.document.file; + +import java.util.Comparator; + +import org.olat.core.util.StringHelper; + +/** + * + * Initial date: 22.04.2015<br> + * @author srosse, stephane.rosse@frentix.com, http://www.frentix.com + * + */ +public abstract class AbstractOfficeDocumentComparator implements Comparator<String> { + + protected int comparePosition(String f1, String f2, String prefix) { + boolean l1; + String p1; + if(f1.length() > prefix.length() && f1.endsWith(".xml")) { + p1 = f1.substring(prefix.length(), f1.indexOf(".xml")); + l1 = StringHelper.isLong(p1); + } else { + p1 = null; + l1 = false; + } + + boolean l2; + String p2; + if(f2.length() > prefix.length() && f2.endsWith(".xml")) { + p2 = f2.substring(prefix.length(), f2.indexOf(".xml")); + l2 = StringHelper.isLong(p2); + } else { + p2 = null; + l2 = false; + } + + int c = 0; + if(l1 && l2) { + try { + Long pl1 = Long.parseLong(p1); + long pl2 = Long.parseLong(p2); + return (int) (pl2 - pl1); + } catch (NumberFormatException e) { + //can happen + } + } else if(l1) { + return -1; + } else if(l2) { + return 1; + } + return c; + } + +} diff --git a/src/main/java/org/olat/search/service/document/file/ExcelDocument.java b/src/main/java/org/olat/search/service/document/file/ExcelDocument.java index 39c36c012ff..7dc32d92c8d 100644 --- a/src/main/java/org/olat/search/service/document/file/ExcelDocument.java +++ b/src/main/java/org/olat/search/service/document/file/ExcelDocument.java @@ -38,6 +38,7 @@ import org.apache.poi.ss.usermodel.Cell; import org.olat.core.gui.util.CSSHelper; import org.olat.core.logging.OLog; import org.olat.core.logging.Tracing; +import org.olat.core.util.io.LimitedContentWriter; import org.olat.core.util.vfs.VFSLeaf; import org.olat.search.service.SearchResourceContext; @@ -56,8 +57,8 @@ public class ExcelDocument extends FileDocument { // } - public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) throws IOException, DocumentException, - DocumentAccessException { + public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) + throws IOException, DocumentException, DocumentAccessException { ExcelDocument excelDocument = new ExcelDocument(); excelDocument.init(leafResourceContext, leaf); excelDocument.setFileType(FILE_TYPE); @@ -66,6 +67,7 @@ public class ExcelDocument extends FileDocument { return excelDocument.getLuceneDocument(); } + @Override protected FileContent readContent(VFSLeaf leaf) throws IOException, DocumentException { BufferedInputStream bis = null; int cellNullCounter = 0; @@ -74,7 +76,8 @@ public class ExcelDocument extends FileDocument { try { bis = new BufferedInputStream(leaf.getInputStream()); - StringBuilder content = new StringBuilder(bis.available()); + + LimitedContentWriter content = new LimitedContentWriter(bis.available(), FileDocumentFactory.getMaxFileSize()); POIFSFileSystem fs = new POIFSFileSystem(bis); HSSFWorkbook workbook = new HSSFWorkbook(fs); @@ -110,6 +113,7 @@ public class ExcelDocument extends FileDocument { + sheetNullCounter); } } + content.close(); return new FileContent(content.toString()); } catch (Exception ex) { throw new DocumentException("Can not read XLS Content. File=" + leaf.getName()); diff --git a/src/main/java/org/olat/search/service/document/file/ExcelOOXMLDocument.java b/src/main/java/org/olat/search/service/document/file/ExcelOOXMLDocument.java index 29f7d7b872d..f1d7baa730d 100644 --- a/src/main/java/org/olat/search/service/document/file/ExcelOOXMLDocument.java +++ b/src/main/java/org/olat/search/service/document/file/ExcelOOXMLDocument.java @@ -30,11 +30,10 @@ import org.apache.lucene.document.Document; import org.olat.core.gui.util.CSSHelper; import org.olat.core.logging.OLog; import org.olat.core.logging.Tracing; -import org.olat.core.util.FileUtils; +import org.olat.core.util.io.LimitedContentWriter; import org.olat.core.util.io.ShieldInputStream; import org.olat.core.util.vfs.VFSLeaf; import org.olat.search.service.SearchResourceContext; -import org.olat.search.service.document.file.utils.SlicedDocument; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; @@ -77,50 +76,38 @@ public class ExcelOOXMLDocument extends FileDocument { Map<String,String> sharedStrings = parseSharedStrings(leaf); //parse sheets String content = parseSheets(sharedStrings, leaf); - return new FileContent(content); } - private String parseSheets(Map<String,String> sharedStrings, VFSLeaf leaf) throws IOException, DocumentException { - InputStream stream = null; - ZipInputStream zip = null; - try { - stream = leaf.getInputStream(); - zip = new ZipInputStream(stream); - ZipEntry entry = zip.getNextEntry(); + try(InputStream stream = leaf.getInputStream(); + ZipInputStream zip = new ZipInputStream(stream)) { - SlicedDocument doc = new SlicedDocument(); + ZipEntry entry = zip.getNextEntry(); + LimitedContentWriter writer = new LimitedContentWriter(100000, FileDocumentFactory.getMaxFileSize()); while (entry != null) { - String name = entry.getName(); - if(name.startsWith(SHEET) && name.endsWith(".xml")) { - String position = name.substring(SHEET.length(), name.indexOf(".xml")); - - OfficeDocumentHandler dh = new OfficeDocumentHandler(sharedStrings); - parse(new ShieldInputStream(zip), dh); - doc.setContent(Integer.parseInt(position), dh.getContent()); + if(writer.accept()) { + String name = entry.getName(); + if(name.startsWith(SHEET) && name.endsWith(".xml")) { + OfficeDocumentHandler dh = new OfficeDocumentHandler(writer, sharedStrings); + parse(new ShieldInputStream(zip), dh); + } } entry = zip.getNextEntry(); } - return doc.toStringAndClear(); + return writer.toString(); } catch (DocumentException e) { throw e; } catch (Exception e) { throw new DocumentException(e.getMessage()); - } finally { - FileUtils.closeSafely(zip); - FileUtils.closeSafely(stream); } } private Map<String,String> parseSharedStrings( VFSLeaf leaf) throws IOException, DocumentException { SharedStringsHandler dh = new SharedStringsHandler(); - - InputStream stream = null; - ZipInputStream zip = null; - try { - stream = leaf.getInputStream(); - zip = new ZipInputStream(stream); + try(InputStream stream = leaf.getInputStream(); + ZipInputStream zip = new ZipInputStream(stream)) { + ZipEntry entry = zip.getNextEntry(); while (entry != null) { String name = entry.getName(); @@ -135,9 +122,6 @@ public class ExcelOOXMLDocument extends FileDocument { throw e; } catch (Exception e) { throw new DocumentException(e.getMessage()); - } finally { - FileUtils.closeSafely(zip); - FileUtils.closeSafely(stream); } } @@ -147,7 +131,7 @@ public class ExcelOOXMLDocument extends FileDocument { parser.setContentHandler(handler); parser.setEntityResolver(handler); try { - parser.setFeature("http://xml.org/sax/features/validation", false); + parser.setFeature("http://xml.org/sax/features/validation", false); } catch(Exception e) { log.error("Cannot deactivate validation", e); } @@ -157,21 +141,18 @@ public class ExcelOOXMLDocument extends FileDocument { } } - private class OfficeDocumentHandler extends DefaultHandler { - private final StringBuilder sb = new StringBuilder(); + private static class OfficeDocumentHandler extends DefaultHandler { private boolean row = false; private boolean sharedStrings = false; - private Map<String,String> strings; + private final Map<String,String> strings; + private final LimitedContentWriter sb; - public OfficeDocumentHandler(Map<String,String> strings) { + public OfficeDocumentHandler(LimitedContentWriter sb, Map<String,String> strings) { + this.sb = sb; this.strings = strings; } - public StringBuilder getContent() { - return sb; - } - @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { if("row".equals(qName)) { @@ -211,7 +192,7 @@ public class ExcelOOXMLDocument extends FileDocument { if(sb .length() > 0 && sb.charAt(sb.length() - 1) != ' '){ sb.append(' '); } - sb.append(ch, start, length); + sb.write(ch, start, length); } } } diff --git a/src/main/java/org/olat/search/service/document/file/FileContent.java b/src/main/java/org/olat/search/service/document/file/FileContent.java new file mode 100644 index 00000000000..14b0eb2b4bb --- /dev/null +++ b/src/main/java/org/olat/search/service/document/file/FileContent.java @@ -0,0 +1,51 @@ +/** + * <a href="http://www.openolat.org"> + * OpenOLAT - Online Learning and Training</a><br> + * <p> + * Licensed under the Apache License, Version 2.0 (the "License"); <br> + * you may not use this file except in compliance with the License.<br> + * You may obtain a copy of the License at the + * <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache homepage</a> + * <p> + * Unless required by applicable law or agreed to in writing,<br> + * software distributed under the License is distributed on an "AS IS" BASIS, <br> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br> + * See the License for the specific language governing permissions and <br> + * limitations under the License. + * <p> + * Initial code contributed and copyrighted by<br> + * frentix GmbH, http://www.frentix.com + * <p> + */ +package org.olat.search.service.document.file; + + +/** + * + * Initial date: 08.04.2015<br> + * @author srosse, stephane.rosse@frentix.com, http://www.frentix.com + * + */ +public class FileContent { + + private final String title; + private final String content; + + public FileContent(String content) { + this(null, content); + } + + public FileContent(String title, String content) { + this.title = title; + this.content = content; + } + + public String getTitle() { + return title; + } + + public String getContent() { + return content; + } + +} diff --git a/src/main/java/org/olat/search/service/document/file/FileDocument.java b/src/main/java/org/olat/search/service/document/file/FileDocument.java index cd95ddcf678..f3f48312880 100644 --- a/src/main/java/org/olat/search/service/document/file/FileDocument.java +++ b/src/main/java/org/olat/search/service/document/file/FileDocument.java @@ -130,25 +130,4 @@ public abstract class FileDocument extends OlatDocument { abstract protected FileContent readContent(VFSLeaf leaf) throws IOException, DocumentException, DocumentAccessException; - public static class FileContent { - private final String title; - private final String content; - - public FileContent(String content) { - this(null, content); - } - - public FileContent(String title, String content) { - this.title = title; - this.content = content; - } - - public String getTitle() { - return title; - } - - public String getContent() { - return content; - } - } } diff --git a/src/main/java/org/olat/search/service/document/file/FileDocumentFactory.java b/src/main/java/org/olat/search/service/document/file/FileDocumentFactory.java index 219ac0c8d91..97cf3e2cd67 100644 --- a/src/main/java/org/olat/search/service/document/file/FileDocumentFactory.java +++ b/src/main/java/org/olat/search/service/document/file/FileDocumentFactory.java @@ -85,6 +85,10 @@ public class FileDocumentFactory { searchModule = module; } + public static int getMaxFileSize() { + return (int)searchModule.getMaxFileSize(); + } + public Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) throws IOException, DocumentAccessException { try { diff --git a/src/main/java/org/olat/search/service/document/file/HtmlDocument.java b/src/main/java/org/olat/search/service/document/file/HtmlDocument.java index c2f3b62f8dc..0aa234d4be4 100644 --- a/src/main/java/org/olat/search/service/document/file/HtmlDocument.java +++ b/src/main/java/org/olat/search/service/document/file/HtmlDocument.java @@ -32,7 +32,6 @@ import org.apache.lucene.document.Document; import org.olat.core.gui.util.CSSHelper; import org.olat.core.logging.OLog; import org.olat.core.logging.Tracing; -import org.olat.core.util.FileUtils; import org.olat.core.util.filter.impl.NekoHTMLFilter; import org.olat.core.util.filter.impl.NekoHTMLFilter.NekoContent; import org.olat.core.util.vfs.VFSLeaf; @@ -53,20 +52,25 @@ public class HtmlDocument extends FileDocument { } public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) throws IOException,DocumentException,DocumentAccessException { - HtmlDocument htmlDocument = new HtmlDocument(); - htmlDocument.init(leafResourceContext,leaf); - htmlDocument.setFileType(FILE_TYPE); - htmlDocument.setCssIcon(CSSHelper.createFiletypeIconCssClassFor(leaf.getName())); + HtmlDocument htmlDocument = new HtmlDocument(); + htmlDocument.init(leafResourceContext,leaf); + htmlDocument.setFileType(FILE_TYPE); + htmlDocument.setCssIcon(CSSHelper.createFiletypeIconCssClassFor(leaf.getName())); if (log.isDebug() ) log.debug(htmlDocument.toString()); return htmlDocument.getLuceneDocument(); } - protected FileContent readContent(VFSLeaf leaf) { - InputStream is = leaf.getInputStream(); - // Remove all HTML and Tags - NekoContent output = new NekoHTMLFilter().filter(is); - if (log.isDebug() ) log.debug("HTML content without tags :" + output); - FileUtils.closeSafely(is); - return new FileContent(output.getTitle(), output.getContent()); + @Override + protected FileContent readContent(VFSLeaf leaf) throws DocumentException { + try(InputStream is = leaf.getInputStream()) { + // Remove all HTML and Tags + NekoContent output = new NekoHTMLFilter().filter(is); + if (log.isDebug()) + log.debug("HTML content without tags :" + output); + + return new FileContent(output.getTitle(), output.getContent()); + } catch(Exception e) { + throw new DocumentException(e.getMessage()); + } } } diff --git a/src/main/java/org/olat/search/service/document/file/OpenDocument.java b/src/main/java/org/olat/search/service/document/file/OpenDocument.java index 5dc2ae635e9..91f59ec4a18 100644 --- a/src/main/java/org/olat/search/service/document/file/OpenDocument.java +++ b/src/main/java/org/olat/search/service/document/file/OpenDocument.java @@ -30,6 +30,7 @@ import org.olat.core.gui.util.CSSHelper; import org.olat.core.logging.OLog; import org.olat.core.logging.Tracing; import org.olat.core.util.FileUtils; +import org.olat.core.util.io.LimitedContentWriter; import org.olat.core.util.io.ShieldInputStream; import org.olat.core.util.vfs.VFSLeaf; import org.olat.search.service.SearchResourceContext; @@ -103,7 +104,7 @@ public class OpenDocument extends FileDocument { FileUtils.closeSafely(zip); FileUtils.closeSafely(stream); } - return new FileContent(dh.getContent()); + return new FileContent(dh.toString()); } private void parse(InputStream stream, DefaultHandler handler) throws DocumentException { @@ -112,9 +113,8 @@ public class OpenDocument extends FileDocument { parser.setContentHandler(handler); parser.setEntityResolver(handler); try { - parser.setFeature("http://xml.org/sax/features/validation", false); + parser.setFeature("http://xml.org/sax/features/validation", false); } catch(Exception e) { - e.printStackTrace(); //cannot desactivate validation } parser.parse(new InputSource(stream)); @@ -125,15 +125,7 @@ public class OpenDocument extends FileDocument { private class OpenDocumentHandler extends DefaultHandler { - private final StringBuilder sb = new StringBuilder(); - - public OpenDocumentHandler() { - // - } - - public String getContent() { - return sb.toString(); - } + private final LimitedContentWriter sb = new LimitedContentWriter(5000, FileDocumentFactory.getMaxFileSize()); @Override public InputSource resolveEntity(String publicId, String systemId) { @@ -150,7 +142,7 @@ public class OpenDocument extends FileDocument { if(sb .length() > 0 && sb.charAt(sb.length() - 1) != ' '){ sb.append(' '); } - sb.append(ch, start, length); + sb.write(ch, start, length); } } } diff --git a/src/main/java/org/olat/search/service/document/file/PdfDocument.java b/src/main/java/org/olat/search/service/document/file/PdfDocument.java index 65be5390978..ad174712e23 100644 --- a/src/main/java/org/olat/search/service/document/file/PdfDocument.java +++ b/src/main/java/org/olat/search/service/document/file/PdfDocument.java @@ -25,9 +25,9 @@ package org.olat.search.service.document.file; -import java.io.BufferedInputStream; +import java.io.BufferedReader; import java.io.File; -import java.io.FileInputStream; +import java.io.FileReader; import java.io.IOException; import org.apache.lucene.document.Document; @@ -35,7 +35,7 @@ import org.olat.core.CoreSpringFactory; import org.olat.core.gui.util.CSSHelper; import org.olat.core.logging.OLog; import org.olat.core.logging.Tracing; -import org.olat.core.util.FileUtils; +import org.olat.core.util.io.LimitedContentWriter; import org.olat.core.util.vfs.VFSLeaf; import org.olat.search.service.SearchResourceContext; import org.olat.search.service.SearchServiceFactory; @@ -48,11 +48,11 @@ import org.olat.search.service.document.file.pdf.PdfExtractor; */ public class PdfDocument extends FileDocument { private static final long serialVersionUID = 6432923202585881794L; - private static OLog log = Tracing.createLoggerFor(PdfDocument.class); + private static final OLog log = Tracing.createLoggerFor(PdfDocument.class); public final static String FILE_TYPE = "type.file.pdf"; - private boolean externalIndexer; + private boolean externalIndexer; private String pdfTextBufferPath; private String filePath; @@ -119,21 +119,32 @@ public class PdfDocument extends FileDocument { private FileContent getPdfTextFromBuffer(File pdfTextFile) throws IOException { if (log.isDebug()) log.debug("readContent from text file start..."); - BufferedInputStream bis = null; - try { - bis = new BufferedInputStream(new FileInputStream(pdfTextFile)); - String text = FileUtils.load(bis, "utf-8"); - if (log.isDebug()) log.debug("readContent from text file done."); + + try(BufferedReader br = new BufferedReader(new FileReader(pdfTextFile)); + LimitedContentWriter sb = new LimitedContentWriter(5000, FileDocumentFactory.getMaxFileSize())) { + //search the title + char[] cbuf = new char[4096]; + int length = br.read(cbuf); + int indexSep = 0; + String title = ""; - int indexSep = text.indexOf("\u00A0|\u00A0"); - if(indexSep > 0) { - return new FileContent(text.substring(0, indexSep), text.substring(indexSep + 3, text.length())); - } - return new FileContent(text); - } finally { - if (bis != null) { - bis.close(); + if(length > 0) { + String firstChunk = new String(cbuf, 0, length); + indexSep = firstChunk.indexOf("\u00A0|\u00A0"); + if(indexSep > 0) { + title = firstChunk.substring(0, indexSep); + sb.append(firstChunk.substring(indexSep + 3)); + } else { + sb.append(firstChunk); + } + while((length = br.read(cbuf)) > 0) { + sb.write(cbuf, 0, length); + } } + + return new FileContent(title, sb.toString()); + } catch(IOException e) { + throw e; } } diff --git a/src/main/java/org/olat/search/service/document/file/PowerPointDocument.java b/src/main/java/org/olat/search/service/document/file/PowerPointDocument.java index 2b179c7b072..ccca99d9c89 100644 --- a/src/main/java/org/olat/search/service/document/file/PowerPointDocument.java +++ b/src/main/java/org/olat/search/service/document/file/PowerPointDocument.java @@ -26,10 +26,9 @@ package org.olat.search.service.document.file; import java.io.BufferedInputStream; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; -import java.io.OutputStream; +import java.io.Writer; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -42,6 +41,7 @@ import org.apache.poi.util.LittleEndian; import org.olat.core.gui.util.CSSHelper; import org.olat.core.logging.OLog; import org.olat.core.logging.Tracing; +import org.olat.core.util.io.LimitedContentWriter; import org.olat.core.util.vfs.VFSLeaf; import org.olat.search.service.SearchResourceContext; @@ -67,85 +67,74 @@ public class PowerPointDocument extends FileDocument { if (log.isDebug()) log.debug(powerPointDocument.toString()); return powerPointDocument.getLuceneDocument(); } - - - public FileContent readContent(VFSLeaf leaf) throws IOException,DocumentException { - BufferedInputStream bis = null; - OutputStream oStream = null; - if (log.isDebug()) log.debug("read PPT Content of leaf=" + leaf.getName()); - try { - bis = new BufferedInputStream(leaf.getInputStream()); - oStream = new ByteArrayOutputStream(); - extractText(bis, oStream); - String content = oStream.toString(); - return new FileContent(removeUnvisibleChars(content)); + @Override + public FileContent readContent(VFSLeaf leaf) throws IOException,DocumentException { + if (log.isDebug()) log.debug("read PPT Content of leaf=" + leaf.getName()); + try (BufferedInputStream bis = new BufferedInputStream(leaf.getInputStream())) { + LimitedContentWriter oStream = new LimitedContentWriter(100000, FileDocumentFactory.getMaxFileSize()); + extractText(bis, oStream); + return new FileContent(oStream.toString()); } catch (Exception e) { - throw new DocumentException("Can not read PPT Content. File=" + leaf.getName() ); - } finally { - if (bis != null) { - bis.close(); - } - if (oStream != null) { - oStream.close(); - } + throw new DocumentException("Can not read PPT Content. File=" + leaf.getName(), e); } - } - - /** - * Remove unvisible chars form input string. - * @param inputString - * @return Return filtered string - */ - private String removeUnvisibleChars(String inputString) { - Pattern p = Pattern.compile("[^a-zA-Z0-9\n\r!&#<>{}]"); - Matcher m = p.matcher(inputString); - String output = m.replaceAll(" "); - return output; - } - + } - private void extractText(InputStream inStream, OutputStream stream ) throws IOException { - POIFSReader r = new POIFSReader(); - /* Register a listener for *all* documents. */ - r.registerListener(new MyPOIFSReaderListener(stream)); - r.read(inStream); - } + private void extractText(InputStream inStream, Writer outWriter) throws IOException { + POIFSReader r = new POIFSReader(); + /* Register a listener for *all* documents. */ + r.registerListener(new MyPOIFSReaderListener(outWriter)); + r.read(inStream); + } - private class MyPOIFSReaderListener implements POIFSReaderListener { - private final OutputStream oStream; + private static class MyPOIFSReaderListener implements POIFSReaderListener { + private final Writer oStream; - public MyPOIFSReaderListener(OutputStream oStream) { + public MyPOIFSReaderListener(Writer oStream) { this.oStream = oStream; } + @Override public void processPOIFSReaderEvent(POIFSReaderEvent event) { - int errorCounter = 0; + int errorCounter = 0; - try { - DocumentInputStream dis = null; - dis = event.getStream(); + try { + DocumentInputStream dis = event.getStream(); - byte btoWrite[] = new byte[dis.available()]; - dis.read(btoWrite, 0, dis.available()); - for (int i = 0; i < btoWrite.length - 20; i++) { - long type = LittleEndian.getUShort(btoWrite, i + 2); - long size = LittleEndian.getUInt(btoWrite, i + 4); - if (type == 4008) { - try { - oStream.write(btoWrite, i + 4 + 1, (int) size + 3); - } catch( IndexOutOfBoundsException ex) { - errorCounter++; - } - } - } - } catch (Exception ex) { - // FIXME:chg: Remove general Exception later, for now make it run - log.warn("Can not read PPT content.", ex); - } - if (errorCounter > 0) { - if (log.isDebug()) log.debug("Could not parse ppt properly. There were " + errorCounter + " IndexOutOfBoundsException"); - } - } + byte btoWrite[] = new byte[dis.available()]; + dis.read(btoWrite, 0, dis.available()); + for (int i = 0; i < btoWrite.length - 20; i++) { + long type = LittleEndian.getUShort(btoWrite, i + 2); + long size = LittleEndian.getUInt(btoWrite, i + 4); + if (type == 4008) { + try { + String chunk = new String(btoWrite, i + 4 + 1, (int)(size + 3)); + oStream.write(removeUnvisibleChars(chunk)); + } catch( IndexOutOfBoundsException ex) { + errorCounter++; + } + } + } + } catch (Exception ex) { + // Remove general Exception later, for now make it run + log.warn("Can not read PPT content.", ex); + } + if (errorCounter > 0 && log.isDebug()) { + log.debug("Could not parse ppt properly. There were " + errorCounter + " IndexOutOfBoundsException"); + } + } + + /** + * Remove unvisible chars form input string. + * + * @param inputString + * @return Return filtered string + */ + private String removeUnvisibleChars(String inputString) { + Pattern p = Pattern.compile("[^a-zA-Z0-9\n\r!&#<>{}]"); + Matcher m = p.matcher(inputString); + String output = m.replaceAll(" "); + return output; + } } } diff --git a/src/main/java/org/olat/search/service/document/file/PowerPointOOXMLDocument.java b/src/main/java/org/olat/search/service/document/file/PowerPointOOXMLDocument.java index 2212ea25733..907bf19e63a 100644 --- a/src/main/java/org/olat/search/service/document/file/PowerPointOOXMLDocument.java +++ b/src/main/java/org/olat/search/service/document/file/PowerPointOOXMLDocument.java @@ -19,25 +19,31 @@ */ package org.olat.search.service.document.file; +import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.util.ArrayList; +import java.util.Enumeration; +import java.util.List; import java.util.zip.ZipEntry; -import java.util.zip.ZipInputStream; +import java.util.zip.ZipFile; import org.apache.lucene.document.Document; import org.olat.core.gui.util.CSSHelper; import org.olat.core.logging.OLog; import org.olat.core.logging.Tracing; -import org.olat.core.util.FileUtils; +import org.olat.core.util.io.LimitedContentWriter; import org.olat.core.util.io.ShieldInputStream; +import org.olat.core.util.vfs.JavaIOItem; import org.olat.core.util.vfs.VFSLeaf; import org.olat.search.service.SearchResourceContext; -import org.olat.search.service.document.file.utils.SlicedDocument; import org.xml.sax.InputSource; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; +import edu.emory.mathcs.backport.java.util.Collections; + /** * * Description:<br> @@ -54,8 +60,8 @@ public class PowerPointOOXMLDocument extends FileDocument { public final static String POWERPOINT_FILE_TYPE = "type.file.ppt"; private static final String SLIDE = "ppt/slides/slide"; - public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) throws IOException, DocumentException, - DocumentAccessException { + public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) + throws IOException, DocumentException, DocumentAccessException { PowerPointOOXMLDocument officeDocument = new PowerPointOOXMLDocument(); officeDocument.init(leafResourceContext, leaf); officeDocument.setFileType(POWERPOINT_FILE_TYPE); @@ -68,36 +74,41 @@ public class PowerPointOOXMLDocument extends FileDocument { @Override public FileContent readContent(VFSLeaf leaf) throws IOException, DocumentException { - SlicedDocument doc = new SlicedDocument(); - InputStream stream = null; - ZipInputStream zip = null; - try { - stream = leaf.getInputStream(); - - zip = new ZipInputStream(stream); - ZipEntry entry = zip.getNextEntry(); - while (entry != null) { + File file = ((JavaIOItem)leaf).getBasefile(); + + LimitedContentWriter writer = new LimitedContentWriter(100000, FileDocumentFactory.getMaxFileSize()); + + try(ZipFile wordFile = new ZipFile(file)) { + List<String> contents = new ArrayList<>(); + for(Enumeration<? extends ZipEntry> entriesEnumeration=wordFile.entries(); entriesEnumeration.hasMoreElements(); ) { + ZipEntry entry = entriesEnumeration.nextElement(); String name = entry.getName(); if(name.startsWith(SLIDE) && name.endsWith(".xml")) { - int lastIndex = name.indexOf(".xml"); - String position = name.substring(SLIDE.length(), lastIndex); - - OfficeDocumentHandler dh = new OfficeDocumentHandler(); + contents.add(name); + } + } + + if(contents.size() > 1) { + Collections.sort(contents, new PowerPointDocumentComparator()); + } + + for(String content:contents) { + if(writer.accept()) { + ZipEntry entry = wordFile.getEntry(content); + InputStream zip = wordFile.getInputStream(entry); + OfficeDocumentHandler dh = new OfficeDocumentHandler(writer); parse(new ShieldInputStream(zip), dh); - doc.setContent(Integer.parseInt(position), dh.getContent()); + zip.close(); } - entry = zip.getNextEntry(); } + } catch (DocumentException e) { throw e; } catch (Exception e) { throw new DocumentException(e.getMessage()); - } finally { - FileUtils.closeSafely(zip); - FileUtils.closeSafely(stream); } - return new FileContent(doc.toStringAndClear()); + return new FileContent(writer.toString()); } private void parse(InputStream stream, DefaultHandler handler) throws DocumentException { @@ -106,7 +117,7 @@ public class PowerPointOOXMLDocument extends FileDocument { parser.setContentHandler(handler); parser.setEntityResolver(handler); try { - parser.setFeature("http://xml.org/sax/features/validation", false); + parser.setFeature("http://xml.org/sax/features/validation", false); } catch(Exception e) { log.error("Cannot deactivate validation", e); } @@ -116,11 +127,11 @@ public class PowerPointOOXMLDocument extends FileDocument { } } - private class OfficeDocumentHandler extends DefaultHandler { - private final StringBuilder sb = new StringBuilder(); + private static class OfficeDocumentHandler extends DefaultHandler { + private final LimitedContentWriter sb; - public StringBuilder getContent() { - return sb; + public OfficeDocumentHandler(LimitedContentWriter sb) { + this.sb = sb; } @Override @@ -133,7 +144,28 @@ public class PowerPointOOXMLDocument extends FileDocument { if(sb .length() > 0 && sb.charAt(sb.length() - 1) != ' '){ sb.append(' '); } - sb.append(ch, start, length); + sb.write(ch, start, length); } } + + public static class PowerPointDocumentComparator extends AbstractOfficeDocumentComparator { + + @Override + public int compare(String f1, String f2) { + int c = 0; + if(f1.startsWith(SLIDE) && f2.startsWith(SLIDE)) { + c = comparePosition(f1, f2, SLIDE); + } else if(f1.startsWith(SLIDE)) { + c = 1; + } else if(f2.startsWith(SLIDE)) { + c = -1; + } + + if(c == 0) { + c = f1.compareTo(f2); + } + return -c; + } + + } } \ No newline at end of file diff --git a/src/main/java/org/olat/search/service/document/file/TextDocument.java b/src/main/java/org/olat/search/service/document/file/TextDocument.java index b024b964353..90fd8a18f75 100644 --- a/src/main/java/org/olat/search/service/document/file/TextDocument.java +++ b/src/main/java/org/olat/search/service/document/file/TextDocument.java @@ -1,45 +1,46 @@ /** -* OLAT - Online Learning and Training<br> -* http://www.olat.org -* <p> -* Licensed under the Apache License, Version 2.0 (the "License"); <br> -* you may not use this file except in compliance with the License.<br> -* You may obtain a copy of the License at -* <p> -* http://www.apache.org/licenses/LICENSE-2.0 -* <p> -* Unless required by applicable law or agreed to in writing,<br> -* software distributed under the License is distributed on an "AS IS" BASIS, <br> -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br> -* See the License for the specific language governing permissions and <br> -* limitations under the License. -* <p> -* Copyright (c) since 2004 at Multimedia- & E-Learning Services (MELS),<br> -* University of Zurich, Switzerland. -* <hr> -* <a href="http://www.openolat.org"> -* OpenOLAT - Online Learning and Training</a><br> -* This file has been modified by the OpenOLAT community. Changes are licensed -* under the Apache 2.0 license as the original file. -*/ + * OLAT - Online Learning and Training<br> + * http://www.olat.org + * <p> + * Licensed under the Apache License, Version 2.0 (the "License"); <br> + * you may not use this file except in compliance with the License.<br> + * You may obtain a copy of the License at + * <p> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p> + * Unless required by applicable law or agreed to in writing,<br> + * software distributed under the License is distributed on an "AS IS" BASIS, <br> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br> + * See the License for the specific language governing permissions and <br> + * limitations under the License. + * <p> + * Copyright (c) since 2004 at Multimedia- & E-Learning Services (MELS),<br> + * University of Zurich, Switzerland. + * <hr> + * <a href="http://www.openolat.org"> + * OpenOLAT - Online Learning and Training</a><br> + * This file has been modified by the OpenOLAT community. Changes are licensed + * under the Apache 2.0 license as the original file. + */ package org.olat.search.service.document.file; import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; -import java.io.StringWriter; import java.io.Writer; import org.apache.lucene.document.Document; import org.olat.core.gui.util.CSSHelper; import org.olat.core.logging.OLog; import org.olat.core.logging.Tracing; +import org.olat.core.util.io.LimitedContentWriter; import org.olat.core.util.vfs.VFSLeaf; import org.olat.search.service.SearchResourceContext; /** * Lucene document mapper. + * * @author Christian Guretzki */ public class TextDocument extends FileDocument { @@ -47,48 +48,42 @@ public class TextDocument extends FileDocument { private static final OLog log = Tracing.createLoggerFor(TextDocument.class); public final static String FILE_TYPE = "type.file.text"; - /** - * Limit the maximal size of the content read to index. - */ - public static int MAX_SIZE = 200000; public TextDocument() { // } - - public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) throws IOException,DocumentException,DocumentAccessException { - TextDocument textDocument = new TextDocument(); - textDocument.init(leafResourceContext,leaf); - textDocument.setFileType(FILE_TYPE); - textDocument.setCssIcon(CSSHelper.createFiletypeIconCssClassFor(leaf.getName())); - if (log.isDebug() ) log.debug(textDocument.toString()); + + public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) + throws IOException, DocumentException, DocumentAccessException { + + TextDocument textDocument = new TextDocument(); + textDocument.init(leafResourceContext, leaf); + textDocument.setFileType(FILE_TYPE); + textDocument.setCssIcon(CSSHelper.createFiletypeIconCssClassFor(leaf + .getName())); + if (log.isDebug()) + log.debug(textDocument.toString()); return textDocument.getLuceneDocument(); } - + @Override protected FileContent readContent(VFSLeaf leaf) throws IOException { - StringWriter out = new StringWriter(); - InputStreamReader in = new InputStreamReader(leaf.getInputStream()); + InputStreamReader in = new InputStreamReader(leaf.getInputStream()); + LimitedContentWriter out = new LimitedContentWriter((int)leaf.getSize(), FileDocumentFactory.getMaxFileSize()); try { copy(in, out); } catch (Exception e) { log.error("", e); } - return new FileContent(out.toString()); + return new FileContent(out.toString()); } - - private void copy(Reader input, Writer output) - throws IOException { + + private void copy(Reader input, Writer output) throws IOException { char[] buffer = new char[4096]; - - int count = 0; - int n = 0; - while (-1 != (n = input.read(buffer))) { - output.write(buffer, 0, n); - count += n; - if(count >= MAX_SIZE) { - break; - } - } + + int n = 0; + while (-1 != (n = input.read(buffer))) { + output.write(buffer, 0, n); + } } } diff --git a/src/main/java/org/olat/search/service/document/file/WordDocument.java b/src/main/java/org/olat/search/service/document/file/WordDocument.java index 9cb083602eb..41ef3c11b95 100644 --- a/src/main/java/org/olat/search/service/document/file/WordDocument.java +++ b/src/main/java/org/olat/search/service/document/file/WordDocument.java @@ -1,32 +1,33 @@ /** -* OLAT - Online Learning and Training<br> -* http://www.olat.org -* <p> -* Licensed under the Apache License, Version 2.0 (the "License"); <br> -* you may not use this file except in compliance with the License.<br> -* You may obtain a copy of the License at -* <p> -* http://www.apache.org/licenses/LICENSE-2.0 -* <p> -* Unless required by applicable law or agreed to in writing,<br> -* software distributed under the License is distributed on an "AS IS" BASIS, <br> -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br> -* See the License for the specific language governing permissions and <br> -* limitations under the License. -* <p> -* Copyright (c) since 2004 at Multimedia- & E-Learning Services (MELS),<br> -* University of Zurich, Switzerland. -* <hr> -* <a href="http://www.openolat.org"> -* OpenOLAT - Online Learning and Training</a><br> -* This file has been modified by the OpenOLAT community. Changes are licensed -* under the Apache 2.0 license as the original file. -*/ + * OLAT - Online Learning and Training<br> + * http://www.olat.org + * <p> + * Licensed under the Apache License, Version 2.0 (the "License"); <br> + * you may not use this file except in compliance with the License.<br> + * You may obtain a copy of the License at + * <p> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p> + * Unless required by applicable law or agreed to in writing,<br> + * software distributed under the License is distributed on an "AS IS" BASIS, <br> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br> + * See the License for the specific language governing permissions and <br> + * limitations under the License. + * <p> + * Copyright (c) since 2004 at Multimedia- & E-Learning Services (MELS),<br> + * University of Zurich, Switzerland. + * <hr> + * <a href="http://www.openolat.org"> + * OpenOLAT - Online Learning and Training</a><br> + * This file has been modified by the OpenOLAT community. Changes are licensed + * under the Apache 2.0 license as the original file. + */ package org.olat.search.service.document.file; import java.io.BufferedInputStream; import java.io.IOException; +import java.io.Writer; import java.util.Iterator; import org.apache.lucene.document.Document; @@ -37,11 +38,13 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.olat.core.gui.util.CSSHelper; import org.olat.core.logging.OLog; import org.olat.core.logging.Tracing; +import org.olat.core.util.io.LimitedContentWriter; import org.olat.core.util.vfs.VFSLeaf; import org.olat.search.service.SearchResourceContext; /** * Lucene document mapper. + * * @author Christian Guretzki */ public class WordDocument extends FileDocument { @@ -53,68 +56,72 @@ public class WordDocument extends FileDocument { public WordDocument() { // } - - public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) throws IOException,DocumentException,DocumentAccessException { - WordDocument wordDocument = new WordDocument(); - wordDocument.init(leafResourceContext,leaf); - wordDocument.setFileType(FILE_TYPE); + + public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) + throws IOException, DocumentException, DocumentAccessException { + WordDocument wordDocument = new WordDocument(); + wordDocument.init(leafResourceContext, leaf); + wordDocument.setFileType(FILE_TYPE); wordDocument.setCssIcon(CSSHelper.createFiletypeIconCssClassFor(leaf.getName())); - if (log.isDebug()) log.debug(wordDocument.toString()); + if (log.isDebug()) + log.debug(wordDocument.toString()); return wordDocument.getLuceneDocument(); } - - protected FileContent readContent(VFSLeaf leaf) throws IOException,DocumentException { + + @Override + protected FileContent readContent(VFSLeaf leaf) throws IOException, + DocumentException { BufferedInputStream bis = null; - StringBuilder sb = new StringBuilder(); - try { - bis = new BufferedInputStream(leaf.getInputStream()); - POIFSFileSystem filesystem = new POIFSFileSystem(bis); - Iterator<?> entries = filesystem.getRoot().getEntries(); - while (entries.hasNext()) { - Entry entry = (Entry) entries.next(); - String name = entry.getName(); - if (!(entry instanceof DocumentEntry)) { - // Skip directory entries - } else if ("WordDocument".equals(name)) { - collectWordDocument(filesystem, sb); - } - } + LimitedContentWriter sb = new LimitedContentWriter((int)leaf.getSize(), FileDocumentFactory.getMaxFileSize()); + try { + bis = new BufferedInputStream(leaf.getInputStream()); + POIFSFileSystem filesystem = new POIFSFileSystem(bis); + Iterator<?> entries = filesystem.getRoot().getEntries(); + while (entries.hasNext()) { + Entry entry = (Entry) entries.next(); + String name = entry.getName(); + if (!(entry instanceof DocumentEntry)) { + // Skip directory entries + } else if ("WordDocument".equals(name)) { + collectWordDocument(filesystem, sb); + } + } return new FileContent(sb.toString()); } catch (Exception e) { - log.warn("could not read in word document: " + leaf + " please check, that this is not an docx/rtf/html file!"); + log.warn("could not read in word document: " + leaf + + " please check, that this is not an docx/rtf/html file!"); throw new DocumentException(e.getMessage()); } finally { if (bis != null) { - bis.close(); + bis.close(); } } } - - private void collectWordDocument(POIFSFileSystem filesystem, StringBuilder sb) - throws IOException { + + private void collectWordDocument(POIFSFileSystem filesystem, Writer sb) throws IOException { WordExtractor extractor = new WordExtractor(filesystem); - addTextIfAny(sb, extractor.getHeaderText()); - for (String paragraph : extractor.getParagraphText()) { - sb.append(paragraph).append(' '); - } + addTextIfAny(sb, extractor.getHeaderText()); + for (String paragraph : extractor.getParagraphText()) { + sb.append(paragraph).append(' '); + } - for (String paragraph : extractor.getFootnoteText()) { - sb.append(paragraph).append(' '); - } + for (String paragraph : extractor.getFootnoteText()) { + sb.append(paragraph).append(' '); + } - for (String paragraph : extractor.getCommentsText()) { - sb.append(paragraph).append(' '); - } + for (String paragraph : extractor.getCommentsText()) { + sb.append(paragraph).append(' '); + } - for (String paragraph : extractor.getEndnoteText()) { - sb.append(paragraph).append(' '); - } - addTextIfAny(sb, extractor.getFooterText()); + for (String paragraph : extractor.getEndnoteText()) { + sb.append(paragraph).append(' '); + } + addTextIfAny(sb, extractor.getFooterText()); } - - private void addTextIfAny(StringBuilder sb, String text) { + + private void addTextIfAny(Writer sb, String text) throws IOException { if (text != null && text.length() > 0) { sb.append(text).append(' '); - } + } } } diff --git a/src/main/java/org/olat/search/service/document/file/WordOOXMLDocument.java b/src/main/java/org/olat/search/service/document/file/WordOOXMLDocument.java index 310580c26d0..1cccd55e0d2 100644 --- a/src/main/java/org/olat/search/service/document/file/WordOOXMLDocument.java +++ b/src/main/java/org/olat/search/service/document/file/WordOOXMLDocument.java @@ -19,26 +19,31 @@ */ package org.olat.search.service.document.file; +import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.util.ArrayList; +import java.util.Enumeration; +import java.util.List; import java.util.zip.ZipEntry; -import java.util.zip.ZipInputStream; +import java.util.zip.ZipFile; import org.apache.lucene.document.Document; import org.olat.core.gui.util.CSSHelper; import org.olat.core.logging.OLog; import org.olat.core.logging.Tracing; -import org.olat.core.util.FileUtils; -import org.olat.core.util.StringHelper; +import org.olat.core.util.io.LimitedContentWriter; import org.olat.core.util.io.ShieldInputStream; +import org.olat.core.util.vfs.JavaIOItem; import org.olat.core.util.vfs.VFSLeaf; import org.olat.search.service.SearchResourceContext; -import org.olat.search.service.document.file.utils.SlicedDocument; import org.xml.sax.InputSource; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; +import edu.emory.mathcs.backport.java.util.Collections; + /** * * Description:<br> @@ -55,6 +60,7 @@ public class WordOOXMLDocument extends FileDocument { public final static String WORD_FILE_TYPE = "type.file.word"; private static final String HEADER = "word/header"; private static final String FOOTER = "word/footer"; + private static final String DOCUMENT = "word/document.xml"; public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) throws IOException, DocumentException, DocumentAccessException { @@ -71,58 +77,47 @@ public class WordOOXMLDocument extends FileDocument { @Override public FileContent readContent(VFSLeaf leaf) throws IOException, DocumentException { - SlicedDocument doc = new SlicedDocument(); - InputStream stream = null; - ZipInputStream zip = null; - try { - stream = leaf.getInputStream(); + File file = ((JavaIOItem)leaf).getBasefile(); - zip = new ZipInputStream(stream); - ZipEntry entry = zip.getNextEntry(); - while (entry != null) { + LimitedContentWriter writer = new LimitedContentWriter(100000, FileDocumentFactory.getMaxFileSize()); + + try(ZipFile wordFile = new ZipFile(file)) { + + List<String> contents = new ArrayList<>(); + for(Enumeration<? extends ZipEntry> entriesEnumeration=wordFile.entries(); entriesEnumeration.hasMoreElements(); ) { + ZipEntry entry = entriesEnumeration.nextElement(); String name = entry.getName(); if(name.endsWith("word/document.xml")) { - OfficeDocumentHandler dh = new OfficeDocumentHandler(); - parse(new ShieldInputStream(zip), dh); - doc.setContent(0, dh.getContent()); + contents.add(name); } else if(name.startsWith(HEADER) && name.endsWith(".xml")) { - String position = name.substring(HEADER.length(), name.indexOf(".xml")); - if(StringHelper.isLong(position)) { - try { - OfficeDocumentHandler dh = new OfficeDocumentHandler(); - parse(new ShieldInputStream(zip), dh); - doc.setHeader(Integer.parseInt(position), dh.getContent()); - } catch (NumberFormatException e) { - log.warn("", e); - //if position not a position, go head - } - } + contents.add(name); } else if(name.startsWith(FOOTER) && name.endsWith(".xml")) { - String position = name.substring(FOOTER.length(), name.indexOf(".xml")); - if(StringHelper.isLong(position)) { - try { - OfficeDocumentHandler dh = new OfficeDocumentHandler(); - parse(new ShieldInputStream(zip), dh); - doc.setFooter(Integer.parseInt(position), dh.getContent()); - } catch (NumberFormatException e) { - log.warn("", e); - //if position not a position, go head - } - } + contents.add(name); } - entry = zip.getNextEntry(); } + + if(contents.size() > 1) { + Collections.sort(contents, new WordDocumentComparator()); + } + + for(String content:contents) { + if(writer.accept()) { + ZipEntry entry = wordFile.getEntry(content); + InputStream zip = wordFile.getInputStream(entry); + OfficeDocumentHandler dh = new OfficeDocumentHandler(writer); + parse(new ShieldInputStream(zip), dh); + zip.close(); + } + } + } catch (DocumentException e) { throw e; } catch (Exception e) { - e.printStackTrace(); throw new DocumentException(e.getMessage()); - } finally { - FileUtils.closeSafely(zip); - FileUtils.closeSafely(stream); } - return new FileContent(doc.toStringAndClear()); + + return new FileContent(writer.toString()); } private void parse(InputStream stream, DefaultHandler handler) throws DocumentException { @@ -131,7 +126,7 @@ public class WordOOXMLDocument extends FileDocument { parser.setContentHandler(handler); parser.setEntityResolver(handler); try { - parser.setFeature("http://xml.org/sax/features/validation", false); + parser.setFeature("http://xml.org/sax/features/validation", false); } catch(Exception e) { log.error("Cannot deactivate validation", e); } @@ -141,11 +136,11 @@ public class WordOOXMLDocument extends FileDocument { } } - private class OfficeDocumentHandler extends DefaultHandler { - private final StringBuilder sb = new StringBuilder(); - - public StringBuilder getContent() { - return sb; + private static class OfficeDocumentHandler extends DefaultHandler { + private final LimitedContentWriter sb; + + public OfficeDocumentHandler(LimitedContentWriter sb) { + this.sb = sb; } @Override @@ -153,7 +148,39 @@ public class WordOOXMLDocument extends FileDocument { if(sb .length() > 0 && sb.charAt(sb.length() - 1) != ' '){ sb.append(' '); } - sb.append(ch, start, length); + sb.write(ch, start, length); + } + } + + public static class WordDocumentComparator extends AbstractOfficeDocumentComparator { + + @Override + public int compare(String f1, String f2) { + int c = 0; + if(f1.endsWith(DOCUMENT)) { + if(f2.startsWith(HEADER)) { + c = -1; + } else if(f2.startsWith(FOOTER)) { + c = 1; + } + } else if(f1.startsWith(HEADER)) { + if(f2.startsWith(DOCUMENT) || f2.startsWith(FOOTER)) { + c = 1; + } else if(f2.startsWith(HEADER)) { + c = comparePosition(f1, f2, HEADER); + } + } else if(f1.startsWith(FOOTER)) { + if(f2.startsWith(DOCUMENT) || f2.startsWith(HEADER)) { + c = -1; + } else if(f2.startsWith(FOOTER)) { + c = comparePosition(f1, f2, FOOTER); + } + } + + if(c == 0) { + c = f1.compareTo(f2); + } + return -c; } } } \ No newline at end of file diff --git a/src/main/java/org/olat/search/service/document/file/pdf/PdfBoxExtractor.java b/src/main/java/org/olat/search/service/document/file/pdf/PdfBoxExtractor.java index 31b2d622b73..82d7b3b507f 100644 --- a/src/main/java/org/olat/search/service/document/file/pdf/PdfBoxExtractor.java +++ b/src/main/java/org/olat/search/service/document/file/pdf/PdfBoxExtractor.java @@ -21,18 +21,19 @@ package org.olat.search.service.document.file.pdf; import java.io.BufferedInputStream; import java.io.File; -import java.io.FileOutputStream; +import java.io.FileWriter; import java.io.IOException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.util.PDFTextStripper; import org.olat.core.logging.OLog; import org.olat.core.logging.Tracing; -import org.olat.core.util.FileUtils; import org.olat.core.util.StringHelper; +import org.olat.core.util.io.LimitedContentWriter; import org.olat.core.util.vfs.VFSLeaf; import org.olat.search.service.document.file.DocumentAccessException; -import org.olat.search.service.document.file.FileDocument.FileContent; +import org.olat.search.service.document.file.FileContent; +import org.olat.search.service.document.file.FileDocumentFactory; /** * @@ -52,11 +53,14 @@ public class PdfBoxExtractor implements PdfExtractor { } private void storePdfTextInBuffer(FileContent pdfText, File pdfTextFile) throws IOException { - FileOutputStream out = new FileOutputStream(pdfTextFile); - if(StringHelper.containsNonWhitespace(pdfText.getTitle())) { - FileUtils.save(out, pdfText.getTitle() + "\u00A0|\u00A0" + pdfText.getContent(), "utf-8"); - } else { - FileUtils.save(out, pdfText.getContent(), "utf-8"); + try(FileWriter out = new FileWriter(pdfTextFile)) { + if(StringHelper.containsNonWhitespace(pdfText.getTitle())) { + out.write(pdfText.getTitle()); + out.write("\u00A0|\u00A0"); + } + out.write(pdfText.getContent()); + } catch(IOException e) { + throw e; } } @@ -72,13 +76,19 @@ public class PdfBoxExtractor implements PdfExtractor { document.decrypt(""); } catch (Exception e) { log.warn("PDF is encrypted. Can not read content file=" + leaf.getName()); - return new FileContent(leaf.getName(), leaf.getName()); + LimitedContentWriter writer = new LimitedContentWriter(128, FileDocumentFactory.getMaxFileSize()); + writer.append(leaf.getName()); + writer.close(); + return new FileContent(leaf.getName(), writer.toString()); } } String title = getTitle(document); if (log.isDebug()) log.debug("readContent PDDocument loaded"); PDFTextStripper stripper = new PDFTextStripper(); - return new FileContent(title, stripper.getText(document)); + LimitedContentWriter writer = new LimitedContentWriter(50000, FileDocumentFactory.getMaxFileSize()); + stripper.writeText(document, writer); + writer.close(); + return new FileContent(title, writer.toString()); } finally { if (document != null) { document.close(); @@ -95,5 +105,4 @@ public class PdfBoxExtractor implements PdfExtractor { } return null; } - } diff --git a/src/main/java/org/olat/search/service/document/file/utils/SlicedDocument.java b/src/main/java/org/olat/search/service/document/file/utils/SlicedDocument.java deleted file mode 100644 index 95fba7bc4da..00000000000 --- a/src/main/java/org/olat/search/service/document/file/utils/SlicedDocument.java +++ /dev/null @@ -1,87 +0,0 @@ -/** - * <a href="http://www.openolat.org"> - * OpenOLAT - Online Learning and Training</a><br> - * <p> - * Licensed under the Apache License, Version 2.0 (the "License"); <br> - * you may not use this file except in compliance with the License.<br> - * You may obtain a copy of the License at the - * <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache homepage</a> - * <p> - * Unless required by applicable law or agreed to in writing,<br> - * software distributed under the License is distributed on an "AS IS" BASIS, <br> - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br> - * See the License for the specific language governing permissions and <br> - * limitations under the License. - * <p> - * Initial code contributed and copyrighted by<br> - * frentix GmbH, http://www.frentix.com - * <p> - */ -package org.olat.search.service.document.file.utils; - -import java.util.ArrayList; -import java.util.List; - -/** - * - * Utility class to help reorder the slides/sheets/fotters/headers of XML - * documents which are sliced in different numbered XML files - * - * <P> - * Initial Date: 5 nov. 2012<br> - * @author srosse, stephane.rosse@frentix.com, http://www.frentix.com - */ -public class SlicedDocument { - - private List<StringBuilder> headers = new ArrayList<StringBuilder>(); - private List<StringBuilder> documents = new ArrayList<StringBuilder>(); - private List<StringBuilder> footers = new ArrayList<StringBuilder>(); - - private int size; - - public void setHeader(int index, StringBuilder doc) { - ensureSize(headers, index); - headers.set(index, doc); - size += doc.length(); - } - - public void setContent(int index, StringBuilder doc) { - ensureSize(documents, index); - documents.set(index, doc); - size += doc.length(); - } - - public void setFooter(int index, StringBuilder doc) { - ensureSize(footers, index); - footers.set(index, doc); - size += doc.length(); - } - - private final void ensureSize(List<StringBuilder> list, int index) { - if(list.size() <= index) { - for(int i=list.size(); i< (index+20); i++) { - list.add(null); - } - } - } - - private final List<StringBuilder> toStringAndClear(StringBuilder content, List<StringBuilder> list) { - if(list != null && !list.isEmpty()) { - for(StringBuilder document:list) { - if(document != null) { - content.append(document).append('\n'); - } - } - list.clear(); - } - return null; - } - - public String toStringAndClear() { - StringBuilder content = new StringBuilder(size + 100); - headers = toStringAndClear(content, headers); - documents = toStringAndClear(content, documents); - footers = toStringAndClear(content, footers); - return content.toString(); - } -} diff --git a/src/test/java/org/olat/search/service/document/file/FileDocumentFactoryTest.java b/src/test/java/org/olat/search/service/document/file/FileDocumentFactoryTest.java index 8ef724ff0be..d3f9663c2f6 100644 --- a/src/test/java/org/olat/search/service/document/file/FileDocumentFactoryTest.java +++ b/src/test/java/org/olat/search/service/document/file/FileDocumentFactoryTest.java @@ -94,9 +94,9 @@ public class FileDocumentFactoryTest extends OlatTestCase { assertTrue("xml must be supported", fileDocumentFactory.isFileSupported(new LocalFileImpl(new File("test.xml")))); } - private VFSLeaf getVFSFile(String name) { + private VFSLeaf getVFSFile(String filename) { try { - URL url = FileDocumentFactoryTest.class.getResource(name); + URL url = FileDocumentFactoryTest.class.getResource(filename); File file = new File(url.toURI()); return new LocalFileImpl(file); } catch (URISyntaxException e) { @@ -105,7 +105,8 @@ public class FileDocumentFactoryTest extends OlatTestCase { } } - @Test public void testCreateHtmlDocument() { + @Test + public void testCreateHtmlDocument() { String filePath = "SearchTestFolder"; String htmlFileName = "test.html"; String htmlText = "<html><head><meta name=\"generator\" content=\"olat-tinymce-1\"><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head><body>" diff --git a/src/test/java/org/olat/search/service/document/file/OfficeDocumentTest.java b/src/test/java/org/olat/search/service/document/file/OfficeDocumentTest.java index 2928c1c4f2d..3d91b7f4ada 100644 --- a/src/test/java/org/olat/search/service/document/file/OfficeDocumentTest.java +++ b/src/test/java/org/olat/search/service/document/file/OfficeDocumentTest.java @@ -23,12 +23,16 @@ import java.io.File; import java.io.IOException; import java.net.URISyntaxException; import java.net.URL; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import org.junit.Assert; import org.junit.Test; import org.olat.core.util.vfs.VFSLeaf; -import org.olat.search.service.document.file.FileDocument.FileContent; +import org.olat.test.OlatTestCase; import org.olat.test.VFSJavaIOFile; +import org.springframework.beans.factory.annotation.Autowired; /** * Test the low memory text extractor for OpenXML (Microsoft Office XML) @@ -36,7 +40,10 @@ import org.olat.test.VFSJavaIOFile; * * @author srosse, stephane.rosse@frentix.com, http://www.frentix.com */ -public class OfficeDocumentTest { +public class OfficeDocumentTest extends OlatTestCase { + + @Autowired + private FileDocumentFactory fileDocumentFactory; @Test public void testWordOpenXMLDocument() throws IOException, DocumentException, DocumentAccessException, URISyntaxException { @@ -52,6 +59,28 @@ public class OfficeDocumentTest { Assert.assertTrue(body.contains("They prefer to start writing a document at home in desktop or laptop computer")); } + @Test + public void testWordOOXMLDocumentComparator() { + List<String> docs = new ArrayList<>(); + docs.add("word/document.xml"); + docs.add("word/header1.xml"); + docs.add("word/footer3.xml"); + docs.add("word/footer.xml"); + docs.add("word/footer14.xml"); + docs.add("word/header4.xml"); + docs.add("word/header25.xml"); + + Collections.sort(docs, new WordOOXMLDocument.WordDocumentComparator()); + + Assert.assertEquals("word/header1.xml", docs.get(0)); + Assert.assertEquals("word/header4.xml", docs.get(1)); + Assert.assertEquals("word/header25.xml", docs.get(2)); + Assert.assertEquals("word/document.xml", docs.get(3)); + Assert.assertEquals("word/footer.xml", docs.get(4)); + Assert.assertEquals("word/footer3.xml", docs.get(5)); + Assert.assertEquals("word/footer14.xml", docs.get(6)); + } + @Test public void testWordDocument() throws IOException, DocumentException, DocumentAccessException, URISyntaxException { URL docUrl = OfficeDocumentTest.class.getResource("Test_word_indexing.doc"); @@ -106,6 +135,28 @@ public class OfficeDocumentTest { Assert.assertTrue(body.contains("Here is some text")); } + @Test + public void testPowerPointOOXMLDocumentComparator() { + List<String> docs = new ArrayList<>(); + docs.add("word/dru.xml"); + docs.add("ppt/slides/slide9.xml"); + docs.add("ppt/slides/slide6.xml"); + docs.add("ppt/slides/slide25.xml"); + docs.add("ppt/slides/slide.xml"); + docs.add("ppt/slides/slide12.xml"); + docs.add("ppt/slides/slide3.xml"); + + Collections.sort(docs, new PowerPointOOXMLDocument.PowerPointDocumentComparator()); + + Assert.assertEquals("ppt/slides/slide.xml", docs.get(0)); + Assert.assertEquals("ppt/slides/slide3.xml", docs.get(1)); + Assert.assertEquals("ppt/slides/slide6.xml", docs.get(2)); + Assert.assertEquals("ppt/slides/slide9.xml", docs.get(3)); + Assert.assertEquals("ppt/slides/slide12.xml", docs.get(4)); + Assert.assertEquals("ppt/slides/slide25.xml", docs.get(5)); + Assert.assertEquals("word/dru.xml", docs.get(6)); + } + @Test public void testPowerPointDocument() throws IOException, DocumentException, DocumentAccessException, URISyntaxException { URL docUrl = OfficeDocumentTest.class.getResource("Test_ppt_indexing.ppt"); diff --git a/src/test/java/org/olat/search/service/document/file/PDFDocumentTest.java b/src/test/java/org/olat/search/service/document/file/PDFDocumentTest.java index 97aacdf5a0f..12530e878c9 100644 --- a/src/test/java/org/olat/search/service/document/file/PDFDocumentTest.java +++ b/src/test/java/org/olat/search/service/document/file/PDFDocumentTest.java @@ -20,6 +20,7 @@ package org.olat.search.service.document.file; import java.io.File; +import java.io.IOException; import java.net.URISyntaxException; import java.net.URL; import java.util.UUID; @@ -27,7 +28,6 @@ import java.util.UUID; import org.junit.Assert; import org.junit.Test; import org.olat.core.util.vfs.VFSLeaf; -import org.olat.search.service.document.file.FileDocument.FileContent; import org.olat.test.OlatTestCase; import org.olat.test.VFSJavaIOFile; @@ -38,7 +38,8 @@ import org.olat.test.VFSJavaIOFile; public class PDFDocumentTest extends OlatTestCase { @Test - public void testPDFDocument() throws DocumentException, DocumentAccessException, URISyntaxException { + public void testPDFDocument() + throws DocumentException, DocumentAccessException, URISyntaxException, IOException { URL pdfUrl = PDFDocumentTest.class.getResource("Test_pdf_indexing.pdf"); Assert.assertNotNull(pdfUrl); @@ -48,11 +49,13 @@ public class PDFDocumentTest extends OlatTestCase { FileContent content = document.readContent(doc); Assert.assertNotNull(content); Assert.assertEquals("Test pdf indexing", content.getTitle()); - Assert.assertEquals("Un petit texte en français", content.getContent().trim()); + String body = content.getContent(); + Assert.assertEquals("Un petit texte en français", body.trim()); } @Test - public void testPDFDocumentCaching() throws DocumentException, DocumentAccessException, URISyntaxException { + public void testPDFDocumentCaching() + throws DocumentException, DocumentAccessException, URISyntaxException, IOException { URL pdfUrl = PDFDocumentTest.class.getResource("Test_pdf_indexing.pdf"); Assert.assertNotNull(pdfUrl); @@ -64,12 +67,14 @@ public class PDFDocumentTest extends OlatTestCase { FileContent contentIndexed = document.readContent(doc); Assert.assertNotNull(contentIndexed); Assert.assertEquals("Test pdf indexing", contentIndexed.getTitle()); - Assert.assertEquals("Un petit texte en français", contentIndexed.getContent().trim()); + String bodyIndexed = contentIndexed.getContent(); + Assert.assertEquals("Un petit texte en français", bodyIndexed.trim()); //take from the cache FileContent contentCached = document.readContent(doc); Assert.assertNotNull(contentCached); Assert.assertEquals("Test pdf indexing", contentCached.getTitle()); - Assert.assertEquals("Un petit texte en français", contentCached.getContent().trim()); + String cachedBody = contentCached.getContent(); + Assert.assertEquals("Un petit texte en français", cachedBody.trim()); } } \ No newline at end of file diff --git a/src/test/java/org/olat/test/VFSJavaIOFile.java b/src/test/java/org/olat/test/VFSJavaIOFile.java index b4a1313f7ef..d8ba423bae1 100644 --- a/src/test/java/org/olat/test/VFSJavaIOFile.java +++ b/src/test/java/org/olat/test/VFSJavaIOFile.java @@ -29,6 +29,7 @@ import org.olat.core.util.vfs.VFSConstants; import org.olat.core.util.vfs.VFSContainer; import org.olat.core.util.vfs.VFSItem; import org.olat.core.util.vfs.VFSLeaf; +import org.olat.core.util.vfs.JavaIOItem; import org.olat.core.util.vfs.VFSStatus; import org.olat.core.util.vfs.callbacks.VFSSecurityCallback; @@ -39,7 +40,7 @@ import org.olat.core.util.vfs.callbacks.VFSSecurityCallback; * * @author srosse, stephane.rosse@frentix.com, http://www.frentix.com */ -public class VFSJavaIOFile implements VFSLeaf { +public class VFSJavaIOFile implements VFSLeaf, JavaIOItem { private final String name; private final File file; @@ -58,6 +59,11 @@ public class VFSJavaIOFile implements VFSLeaf { return file != null && file.exists(); } + @Override + public File getBasefile() { + return file; + } + @Override public VFSItem resolve(String path) { return null; -- GitLab