Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
OLAT CI-CD Testing Project
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Requirements
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Locked files
Deploy
Package Registry
Container Registry
Model registry
Operate
Terraform modules
Monitor
Service Desk
Analyze
Contributor analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Lars Oliver Dam
OLAT CI-CD Testing Project
Commits
cd94da69
Commit
cd94da69
authored
9 years ago
by
srosse
Browse files
Options
Downloads
Patches
Plain Diff
OO-1909: implement a fallback for old word format
parent
37681133
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/main/java/org/olat/search/service/document/file/WordDocument.java
+21
-10
21 additions, 10 deletions
...a/org/olat/search/service/document/file/WordDocument.java
with
21 additions
and
10 deletions
src/main/java/org/olat/search/service/document/file/WordDocument.java
+
21
−
10
View file @
cd94da69
...
@@ -27,10 +27,14 @@ package org.olat.search.service.document.file;
...
@@ -27,10 +27,14 @@ package org.olat.search.service.document.file;
import
java.io.BufferedInputStream
;
import
java.io.BufferedInputStream
;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.io.InputStream
;
import
java.io.Writer
;
import
java.io.Writer
;
import
java.util.Iterator
;
import
java.util.Iterator
;
import
org.apache.lucene.document.Document
;
import
org.apache.lucene.document.Document
;
import
org.apache.poi.hwpf.HWPFOldDocument
;
import
org.apache.poi.hwpf.OldWordFileFormatException
;
import
org.apache.poi.hwpf.extractor.Word6Extractor
;
import
org.apache.poi.hwpf.extractor.WordExtractor
;
import
org.apache.poi.hwpf.extractor.WordExtractor
;
import
org.apache.poi.poifs.filesystem.DocumentEntry
;
import
org.apache.poi.poifs.filesystem.DocumentEntry
;
import
org.apache.poi.poifs.filesystem.Entry
;
import
org.apache.poi.poifs.filesystem.Entry
;
...
@@ -71,10 +75,8 @@ public class WordDocument extends FileDocument {
...
@@ -71,10 +75,8 @@ public class WordDocument extends FileDocument {
@Override
@Override
protected
FileContent
readContent
(
VFSLeaf
leaf
)
throws
IOException
,
protected
FileContent
readContent
(
VFSLeaf
leaf
)
throws
IOException
,
DocumentException
{
DocumentException
{
BufferedInputStream
bis
=
null
;
LimitedContentWriter
sb
=
new
LimitedContentWriter
((
int
)
leaf
.
getSize
(),
FileDocumentFactory
.
getMaxFileSize
());
LimitedContentWriter
sb
=
new
LimitedContentWriter
((
int
)
leaf
.
getSize
(),
FileDocumentFactory
.
getMaxFileSize
());
try
{
try
(
InputStream
bis
=
new
BufferedInputStream
(
leaf
.
getInputStream
()))
{
bis
=
new
BufferedInputStream
(
leaf
.
getInputStream
());
POIFSFileSystem
filesystem
=
new
POIFSFileSystem
(
bis
);
POIFSFileSystem
filesystem
=
new
POIFSFileSystem
(
bis
);
Iterator
<?>
entries
=
filesystem
.
getRoot
().
getEntries
();
Iterator
<?>
entries
=
filesystem
.
getRoot
().
getEntries
();
while
(
entries
.
hasNext
())
{
while
(
entries
.
hasNext
())
{
...
@@ -83,7 +85,7 @@ public class WordDocument extends FileDocument {
...
@@ -83,7 +85,7 @@ public class WordDocument extends FileDocument {
if
(!(
entry
instanceof
DocumentEntry
))
{
if
(!(
entry
instanceof
DocumentEntry
))
{
// Skip directory entries
// Skip directory entries
}
else
if
(
"WordDocument"
.
equals
(
name
))
{
}
else
if
(
"WordDocument"
.
equals
(
name
))
{
collectWordDocument
(
filesystem
,
sb
);
collectWordDocument
(
leaf
,
filesystem
,
sb
);
}
}
}
}
return
new
FileContent
(
sb
.
toString
());
return
new
FileContent
(
sb
.
toString
());
...
@@ -91,18 +93,27 @@ public class WordDocument extends FileDocument {
...
@@ -91,18 +93,27 @@ public class WordDocument extends FileDocument {
log
.
warn
(
"could not read in word document: "
+
leaf
log
.
warn
(
"could not read in word document: "
+
leaf
+
" please check, that this is not an docx/rtf/html file!"
);
+
" please check, that this is not an docx/rtf/html file!"
);
throw
new
DocumentException
(
e
.
getMessage
());
throw
new
DocumentException
(
e
.
getMessage
());
}
finally
{
if
(
bis
!=
null
)
{
bis
.
close
();
}
}
}
}
}
private
void
collectWordDocument
(
POIFSFileSystem
filesystem
,
Writer
sb
)
throws
IOException
{
private
void
collectWordDocument
(
VFSLeaf
leaf
,
POIFSFileSystem
filesystem
,
Writer
sb
)
throws
IOException
{
try
(
WordExtractor
extractor
=
new
WordExtractor
(
filesystem
))
{
try
(
WordExtractor
extractor
=
new
WordExtractor
(
filesystem
))
{
addTextIfAny
(
sb
,
extractor
.
getTextFromPieces
());
addTextIfAny
(
sb
,
extractor
.
getTextFromPieces
());
}
catch
(
OldWordFileFormatException
ex
)
{
collectOldWordDocument
(
leaf
,
sb
);
}
catch
(
Exception
e
)
{
log
.
error
(
"Cannot read word document: "
+
leaf
,
e
);
}
}
private
void
collectOldWordDocument
(
VFSLeaf
leaf
,
Writer
sb
)
throws
IOException
{
try
(
InputStream
bis
=
new
BufferedInputStream
(
leaf
.
getInputStream
()))
{
POIFSFileSystem
pfs
=
new
POIFSFileSystem
(
bis
);
HWPFOldDocument
doc
=
new
HWPFOldDocument
(
pfs
);
Word6Extractor
docExtractor
=
new
Word6Extractor
(
doc
);
addTextIfAny
(
sb
,
docExtractor
.
getText
());
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
log
.
error
(
"
"
,
e
);
log
.
error
(
"
Cannot read old word document: "
+
leaf
,
e
);
}
}
}
}
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment