From ec9c9b25f1fa1d0bd5f919b861c93d39a3e146f2 Mon Sep 17 00:00:00 2001 From: srosse <stephane.rosse@frentix.com> Date: Fri, 26 Oct 2018 10:09:00 +0200 Subject: [PATCH] OO-3295: allow reloading qpool index if deleted, tune pdf extractor --- .../service/document/file/pdf/PdfBoxExtractor.java | 6 ++++-- .../document/file/pdf/PdfExternalExtractor.java | 2 +- .../org/olat/search/service/indexer/JmsIndexer.java | 12 +++++++++--- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/olat/search/service/document/file/pdf/PdfBoxExtractor.java b/src/main/java/org/olat/search/service/document/file/pdf/PdfBoxExtractor.java index 82e9f0c4810..a73f2ffce94 100644 --- a/src/main/java/org/olat/search/service/document/file/pdf/PdfBoxExtractor.java +++ b/src/main/java/org/olat/search/service/document/file/pdf/PdfBoxExtractor.java @@ -64,7 +64,7 @@ public class PdfBoxExtractor implements PdfExtractor { } } - private FileContent extractTextFromPdf(VFSLeaf leaf) throws IOException, DocumentAccessException { + private FileContent extractTextFromPdf(VFSLeaf leaf) throws IOException { if (log.isDebug()) log.debug("readContent from pdf starts..."); try(BufferedInputStream bis = new BufferedInputStream(leaf.getInputStream()); @@ -72,6 +72,8 @@ public class PdfBoxExtractor implements PdfExtractor { String title = getTitle(document); if (log.isDebug()) log.debug("readContent PDDocument loaded"); PDFTextStripper stripper = new PDFTextStripper(); + stripper.setSortByPosition(true); + stripper.setSuppressDuplicateOverlappingText(true); LimitedContentWriter writer = new LimitedContentWriter(50000, FileDocumentFactory.getMaxFileSize()); stripper.writeText(document, writer); writer.close(); @@ -88,7 +90,7 @@ public class PdfBoxExtractor implements PdfExtractor { return new FileContent(leaf.getName(), writer.toString()); } catch(Exception e) { log.error("", e); - return null; + return new FileContent("", ""); } } diff --git a/src/main/java/org/olat/search/service/document/file/pdf/PdfExternalExtractor.java b/src/main/java/org/olat/search/service/document/file/pdf/PdfExternalExtractor.java index e59b3af14fc..d9872f14a63 100644 --- a/src/main/java/org/olat/search/service/document/file/pdf/PdfExternalExtractor.java +++ b/src/main/java/org/olat/search/service/document/file/pdf/PdfExternalExtractor.java @@ -66,7 +66,7 @@ public class PdfExternalExtractor implements PdfExtractor { return; } - List<String> cmds = new ArrayList<String>(); + List<String> cmds = new ArrayList<>(); cmds.add(searchModule.getPdfExternalIndexerCmd()); cmds.add(((LocalFileImpl)document).getBasefile().getAbsolutePath()); cmds.add(bufferFile.getAbsolutePath()); diff --git a/src/main/java/org/olat/search/service/indexer/JmsIndexer.java b/src/main/java/org/olat/search/service/indexer/JmsIndexer.java index 8f558ff03ee..82b7a6a02b6 100644 --- a/src/main/java/org/olat/search/service/indexer/JmsIndexer.java +++ b/src/main/java/org/olat/search/service/indexer/JmsIndexer.java @@ -334,9 +334,15 @@ public class JmsIndexer implements MessageListener, LifeFullIndexer, ConfigOnOff } private DirectoryReader getReader() throws IOException { - DirectoryReader newReader = DirectoryReader.openIfChanged(reader); - if(newReader != null) { - reader = newReader; + if(reader == null) { + File tempIndexDir = new File(permanentIndexPath); + Directory indexPath = FSDirectory.open(tempIndexDir.toPath()); + reader = DirectoryReader.open(indexPath); + } else { + DirectoryReader newReader = DirectoryReader.openIfChanged(reader); + if(newReader != null) { + reader = newReader; + } } return reader; } -- GitLab