From ec9c9b25f1fa1d0bd5f919b861c93d39a3e146f2 Mon Sep 17 00:00:00 2001
From: srosse <stephane.rosse@frentix.com>
Date: Fri, 26 Oct 2018 10:09:00 +0200
Subject: [PATCH] OO-3295: allow reloading the qpool index if it was deleted,
 tune the PDF extractor

---
 .../service/document/file/pdf/PdfBoxExtractor.java   |  6 ++++--
 .../document/file/pdf/PdfExternalExtractor.java      |  2 +-
 .../org/olat/search/service/indexer/JmsIndexer.java  | 12 +++++++++---
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/main/java/org/olat/search/service/document/file/pdf/PdfBoxExtractor.java b/src/main/java/org/olat/search/service/document/file/pdf/PdfBoxExtractor.java
index 82e9f0c4810..a73f2ffce94 100644
--- a/src/main/java/org/olat/search/service/document/file/pdf/PdfBoxExtractor.java
+++ b/src/main/java/org/olat/search/service/document/file/pdf/PdfBoxExtractor.java
@@ -64,7 +64,7 @@ public class PdfBoxExtractor implements PdfExtractor {
 		}
 	}
 	
-	private FileContent extractTextFromPdf(VFSLeaf leaf) throws IOException, DocumentAccessException {
+	private FileContent extractTextFromPdf(VFSLeaf leaf) throws IOException {
 		if (log.isDebug()) log.debug("readContent from pdf starts...");
 		
 		try(BufferedInputStream bis = new BufferedInputStream(leaf.getInputStream());
@@ -72,6 +72,8 @@ public class PdfBoxExtractor implements PdfExtractor {
 			String title = getTitle(document);
 			if (log.isDebug()) log.debug("readContent PDDocument loaded");
 			PDFTextStripper stripper = new PDFTextStripper();
+			stripper.setSortByPosition(true);
+			stripper.setSuppressDuplicateOverlappingText(true);
 			LimitedContentWriter writer = new LimitedContentWriter(50000, FileDocumentFactory.getMaxFileSize());
 			stripper.writeText(document, writer);
 			writer.close();
@@ -88,7 +90,7 @@ public class PdfBoxExtractor implements PdfExtractor {
 			return new FileContent(leaf.getName(), writer.toString());
 		} catch(Exception e) {
 			log.error("", e);
-			return null;
+			return new FileContent("", "");
 		}
 	}
 	
diff --git a/src/main/java/org/olat/search/service/document/file/pdf/PdfExternalExtractor.java b/src/main/java/org/olat/search/service/document/file/pdf/PdfExternalExtractor.java
index e59b3af14fc..d9872f14a63 100644
--- a/src/main/java/org/olat/search/service/document/file/pdf/PdfExternalExtractor.java
+++ b/src/main/java/org/olat/search/service/document/file/pdf/PdfExternalExtractor.java
@@ -66,7 +66,7 @@ public class PdfExternalExtractor implements PdfExtractor {
 			return;
 		}
 		
-		List<String> cmds = new ArrayList<String>();
+		List<String> cmds = new ArrayList<>();
 		cmds.add(searchModule.getPdfExternalIndexerCmd());
 		cmds.add(((LocalFileImpl)document).getBasefile().getAbsolutePath());
 		cmds.add(bufferFile.getAbsolutePath());
diff --git a/src/main/java/org/olat/search/service/indexer/JmsIndexer.java b/src/main/java/org/olat/search/service/indexer/JmsIndexer.java
index 8f558ff03ee..82b7a6a02b6 100644
--- a/src/main/java/org/olat/search/service/indexer/JmsIndexer.java
+++ b/src/main/java/org/olat/search/service/indexer/JmsIndexer.java
@@ -334,9 +334,15 @@ public class JmsIndexer implements MessageListener, LifeFullIndexer, ConfigOnOff
 	}
 	
 	private DirectoryReader getReader() throws IOException {
-		DirectoryReader newReader = DirectoryReader.openIfChanged(reader);
-		if(newReader != null) {
-			reader = newReader;
+		if(reader != null) {
+			DirectoryReader refreshed = DirectoryReader.openIfChanged(reader);
+			if(refreshed != null) {
+				reader = refreshed;
+			}
+		} else {
+			// no reader yet: open one on the permanent index directory
+			Directory indexDir = FSDirectory.open(new File(permanentIndexPath).toPath());
+			reader = DirectoryReader.open(indexDir);
 		}
 		return reader;
 	}
-- 
GitLab