diff --git a/src/main/java/org/olat/core/util/filter/impl/NekoHTMLFilter.java b/src/main/java/org/olat/core/util/filter/impl/NekoHTMLFilter.java index 3370d64a25046f25a0daf179dd51875754815287..a3451a35dae5ce928874a268155a4008e854d833 100644 --- a/src/main/java/org/olat/core/util/filter/impl/NekoHTMLFilter.java +++ b/src/main/java/org/olat/core/util/filter/impl/NekoHTMLFilter.java @@ -27,7 +27,8 @@ import java.util.HashSet; import java.util.Set; import org.cyberneko.html.parsers.SAXParser; -import org.olat.core.logging.LogDelegator; +import org.olat.core.logging.OLog; +import org.olat.core.logging.Tracing; import org.olat.core.util.filter.Filter; import org.olat.core.util.io.LimitedContentWriter; import org.olat.search.service.document.file.FileDocumentFactory; @@ -45,7 +46,8 @@ import org.xml.sax.helpers.DefaultHandler; * Initial Date: 2 dec. 2009 <br> * @author srosse */ -public class NekoHTMLFilter extends LogDelegator implements Filter { +public class NekoHTMLFilter implements Filter { + private static final OLog log = Tracing.createLoggerFor(NekoHTMLFilter.class); public static final Set<String> blockTags = new HashSet<String>(); static { @@ -58,7 +60,9 @@ public class NekoHTMLFilter extends LogDelegator implements Filter { } public String filter(String original, boolean pretty) { - if (original == null) return null; + if(original == null) return null; + if(original.isEmpty()) return ""; + try { SAXParser parser = new SAXParser(); HTMLHandler contentHandler = new HTMLHandler((int)(original.length() * 0.66f), pretty); @@ -66,13 +70,13 @@ public class NekoHTMLFilter extends LogDelegator implements Filter { parser.parse(new InputSource(new StringReader(original))); return contentHandler.toString(); } catch (SAXException e) { - logError("", e); + log.error("", e); return null; } catch (IOException e) { - logError("", e); + log.error("", e); return null; } catch (Exception e) { - logError("", e); + log.error("", e); return null; } } @@ -86,13 +90,13 @@ public class NekoHTMLFilter extends LogDelegator implements Filter { parser.parse(new InputSource(in)); return contentHandler.getContent(); } catch (SAXException e) { - logError("", e); + log.error("", e); return null; } catch (IOException e) { - logError("", e); + log.error("", e); return null; } catch (Exception e) { - logError("", e); + log.error("", e); return null; } } diff --git a/src/main/java/org/olat/core/util/filter/impl/SimpleHTMLTagsFilter.java b/src/main/java/org/olat/core/util/filter/impl/SimpleHTMLTagsFilter.java index 1e0bba2de6db0d2ed77d9906d853a10468309ec2..49c61749825e5e65cf346610b3687113c1c5418f 100644 --- a/src/main/java/org/olat/core/util/filter/impl/SimpleHTMLTagsFilter.java +++ b/src/main/java/org/olat/core/util/filter/impl/SimpleHTMLTagsFilter.java @@ -19,11 +19,20 @@ */ package org.olat.core.util.filter.impl; -import java.util.regex.Pattern; +import java.io.IOException; +import java.io.StringReader; +import org.cyberneko.html.parsers.SAXParser; import org.olat.core.logging.OLog; import org.olat.core.logging.Tracing; +import org.olat.core.util.StringHelper; import org.olat.core.util.filter.Filter; +import org.olat.core.util.io.LimitedContentWriter; +import org.olat.search.service.document.file.FileDocumentFactory; +import org.xml.sax.Attributes; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; /** * Description:<br> @@ -44,32 +53,86 @@ import org.olat.core.util.filter.Filter; */ public class SimpleHTMLTagsFilter implements Filter { private static final OLog log = Tracing.createLoggerFor(SimpleHTMLTagsFilter.class); - // match <p> <p/> <br> <br/> - private static final Pattern brAndPTagsPattern = Pattern.compile("<((br)|p|(BR)|P)( )*(/)?>"); - // match </h1>.. - private static final Pattern titleTagsPattern = Pattern.compile("</[hH][123456]>"); - // match everything <....> - private static final Pattern stripHTMLTagsPattern = Pattern.compile("<(!|/)?\\w+((\\s+[\\w-]+(\\s*(=\\s*)?(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)/?>"); - // match entities - private static final Pattern htmlSpacePattern = Pattern.compile(" "); - - /** - * @see org.olat.core.util.filter.Filter#filter(java.lang.String) - */ + + @Override public String filter(String original) { + if(original == null) return null; + if(original.isEmpty()) return ""; + try { - if (original == null) return null; - //some strange chars let to infinite loop in the regexp and need to be replaced - String modified = original.replaceAll("\u00a0", " "); - modified = brAndPTagsPattern.matcher(modified).replaceAll(" "); - modified = titleTagsPattern.matcher(modified).replaceAll(" "); - if (log.isDebug()) log.debug("trying to remove all html tags from: "+modified); - modified = stripHTMLTagsPattern.matcher(modified).replaceAll(""); - modified = htmlSpacePattern.matcher(modified).replaceAll(" "); - return modified; - } catch (Throwable e) { - log.error("Could not filter HTML tags. Using unfiltered string! Original string was::" + original, e); - return original; + SAXParser parser = new SAXParser(); + HTMLHandler contentHandler = new HTMLHandler((int)original.length()); + parser.setContentHandler(contentHandler); + parser.parse(new InputSource(new StringReader(original))); + String text = contentHandler.toString(); + text = text.replace('\u00a0', ' '); + text = StringHelper.escapeHtml(text); + return text; + } catch (SAXException e) { + log.error("", e); + return null; + } catch (IOException e) { + log.error("", e); + return null; + } catch (Exception e) { + log.error("", e); + return null; + } + } + + private static class HTMLHandler extends DefaultHandler { + private boolean collect = true; + private boolean consumeBlanck = false; + private final LimitedContentWriter content; + + public HTMLHandler(int size) { + content = new LimitedContentWriter(size, FileDocumentFactory.getMaxFileSize()); + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes attributes) { + String elem = localName.toLowerCase(); + if("script".equals(elem)) { + collect = false; + // add a single whitespace before each block element but only if not there is not already a whitespace there + } else if("li".equals(elem)) { + content.append(" "); + } else if("br".equals(elem)) { + content.append(" "); + } else if(NekoHTMLFilter.blockTags.contains(elem) && content.length() > 0 && content.charAt(content.length() -1) != ' ' ) { + consumeBlanck = true; + } + } + + @Override + public void characters(char[] chars, int offset, int length) { + if(collect) { + if(consumeBlanck) { + if(content.length() > 0 && content.charAt(content.length() -1) != ' ' && length > 0 && chars[offset] != ' ') { + content.append(' '); + } + consumeBlanck = false; + } + content.write(chars, offset, length); + } + } + + @Override + public void endElement(String uri, String localName, String qName) { + String elem = localName.toLowerCase(); + if("script".equals(elem)) { + collect = true; + } else if("li".equals(elem) || "p".equals(elem)) { + content.append(" "); + } else if(NekoHTMLFilter.blockTags.contains(elem) && content.length() > 0 && content.charAt(content.length() -1) != ' ' ) { + consumeBlanck = true; + } + } + + @Override + public String toString() { + return content.toString(); } } + } diff --git a/src/main/java/org/olat/search/service/document/file/FileDocumentFactory.java b/src/main/java/org/olat/search/service/document/file/FileDocumentFactory.java index 4a8b9c68889e94e56689da99ea298f2739786f42..997adc0b19211bb5e49d310592e4c108cc82c76f 100644 --- a/src/main/java/org/olat/search/service/document/file/FileDocumentFactory.java +++ b/src/main/java/org/olat/search/service/document/file/FileDocumentFactory.java @@ -97,7 +97,7 @@ public class FileDocumentFactory { } public static int getMaxFileSize() { - return (int)searchModule.getMaxFileSize(); + return searchModule == null ? 120000 : (int)searchModule.getMaxFileSize(); } public Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) diff --git a/src/test/java/org/olat/core/util/filter/impl/NekoHTMLFilterTest.java b/src/test/java/org/olat/core/util/filter/impl/NekoHTMLFilterTest.java index 416e907df4db9d76ee6d9f550efb1b6e81cb2fa5..7c2442e32ff465e8312f11e1298b6244e7c33e31 100644 --- a/src/test/java/org/olat/core/util/filter/impl/NekoHTMLFilterTest.java +++ b/src/test/java/org/olat/core/util/filter/impl/NekoHTMLFilterTest.java @@ -54,6 +54,12 @@ public class NekoHTMLFilterTest{ Assert.assertEquals(result, filter.filter(input)); } + @Test + public void escaping() { + String output = filter.filter("Test ä test"); + System.out.println(output); + } + @Test public void testPlainText() { t(null, null); t("", ""); diff --git a/src/test/java/org/olat/core/util/filter/impl/SimpleHTMLTagsFilterTest.java b/src/test/java/org/olat/core/util/filter/impl/SimpleHTMLTagsFilterTest.java index e96150ddf8fb6511f61a115a4cde2c7b550e2c94..99703b8ffc133c05e2801de388ebcf61b4f5509d 100644 --- a/src/test/java/org/olat/core/util/filter/impl/SimpleHTMLTagsFilterTest.java +++ b/src/test/java/org/olat/core/util/filter/impl/SimpleHTMLTagsFilterTest.java @@ -23,8 +23,7 @@ import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; +import org.olat.core.util.filter.Filter; /** * Description:<br> @@ -34,16 +33,17 @@ import org.junit.runners.JUnit4; * Initial Date: 14.07.2009 <br> * @author gnaegi */ -@RunWith(JUnit4.class) public class SimpleHTMLTagsFilterTest { - protected SimpleHTMLTagsFilter filter; + private Filter filter; - @Before public void setup() { + @Before + public void setup() { filter = new SimpleHTMLTagsFilter(); } - @After public void tearDown() { + @After + public void tearDown() { filter = null; } @@ -51,15 +51,17 @@ public class SimpleHTMLTagsFilterTest { Assert.assertEquals(result, filter.filter(input)); } - @Test public void testPlainText() { + @Test + public void testPlainText() { t(null, null); t("", ""); t("hello world", "hello world"); - t("hello \n \t \r world", "hello \n \t \r world"); + t("hello \n \t \r world", "hello \n \t \n world"); t("1+2=3", "1+2=3"); } - @Test public void testSimpleTags() { + @Test + public void testSimpleTags() { t("<b>hello</b> world", "hello world"); t("<b><i>hello</i></b> world", "hello world"); t("<b>h<i>ell</i>o</b> world", "hello world"); @@ -67,7 +69,8 @@ public class SimpleHTMLTagsFilterTest { t("<a ref='#bla' \n title='gugus'>hello</b> world", "hello world"); } - @Test public void testBRAndPReplacement() { + @Test + public void testBRAndPReplacement() { t("<br>", " "); t("<p>", " "); t("<br >", " "); @@ -78,7 +81,8 @@ public class SimpleHTMLTagsFilterTest { t("<p />", " "); } - @Test public void testTagsWithAttributes() { + @Test + public void testTagsWithAttributes() { t("<font color='red'>hello</font> world", "hello world"); t("<font color=\"red\">hello</font> world", "hello world"); t("<a href=\"#top\" color='=>top'>go up</a>", "go up"); @@ -89,7 +93,8 @@ public class SimpleHTMLTagsFilterTest { // Boundary test: this filter does NOT decode HTML entities. Use the // NekoHTMLFilter if you need this feature! - @Test public void testTagsWithEntities() { + @Test + public void testTagsWithEntities() { t("Gnägi", "Gnägi"); t("This is © by frentix", "This is © by frentix"); }