Skip to content
Snippets Groups Projects
Commit 5fedc04e authored by srosse's avatar srosse
Browse files

OO-2055: replace the regex patterns of the simple html tags filter with Neko

parent a4495366
No related branches found
No related tags found
No related merge requests found
......@@ -27,7 +27,8 @@ import java.util.HashSet;
import java.util.Set;
import org.cyberneko.html.parsers.SAXParser;
import org.olat.core.logging.LogDelegator;
import org.olat.core.logging.OLog;
import org.olat.core.logging.Tracing;
import org.olat.core.util.filter.Filter;
import org.olat.core.util.io.LimitedContentWriter;
import org.olat.search.service.document.file.FileDocumentFactory;
......@@ -45,7 +46,8 @@ import org.xml.sax.helpers.DefaultHandler;
* Initial Date: 2 dec. 2009 <br>
* @author srosse
*/
public class NekoHTMLFilter extends LogDelegator implements Filter {
public class NekoHTMLFilter implements Filter {
private static final OLog log = Tracing.createLoggerFor(NekoHTMLFilter.class);
public static final Set<String> blockTags = new HashSet<String>();
static {
......@@ -58,7 +60,9 @@ public class NekoHTMLFilter extends LogDelegator implements Filter {
}
public String filter(String original, boolean pretty) {
if (original == null) return null;
if(original == null) return null;
if(original.isEmpty()) return "";
try {
SAXParser parser = new SAXParser();
HTMLHandler contentHandler = new HTMLHandler((int)(original.length() * 0.66f), pretty);
......@@ -66,13 +70,13 @@ public class NekoHTMLFilter extends LogDelegator implements Filter {
parser.parse(new InputSource(new StringReader(original)));
return contentHandler.toString();
} catch (SAXException e) {
logError("", e);
log.error("", e);
return null;
} catch (IOException e) {
logError("", e);
log.error("", e);
return null;
} catch (Exception e) {
logError("", e);
log.error("", e);
return null;
}
}
......@@ -86,13 +90,13 @@ public class NekoHTMLFilter extends LogDelegator implements Filter {
parser.parse(new InputSource(in));
return contentHandler.getContent();
} catch (SAXException e) {
logError("", e);
log.error("", e);
return null;
} catch (IOException e) {
logError("", e);
log.error("", e);
return null;
} catch (Exception e) {
logError("", e);
log.error("", e);
return null;
}
}
......
......@@ -19,11 +19,20 @@
*/
package org.olat.core.util.filter.impl;
import java.util.regex.Pattern;
import java.io.IOException;
import java.io.StringReader;
import org.cyberneko.html.parsers.SAXParser;
import org.olat.core.logging.OLog;
import org.olat.core.logging.Tracing;
import org.olat.core.util.StringHelper;
import org.olat.core.util.filter.Filter;
import org.olat.core.util.io.LimitedContentWriter;
import org.olat.search.service.document.file.FileDocumentFactory;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
* Description:<br>
......@@ -44,32 +53,86 @@ import org.olat.core.util.filter.Filter;
*/
public class SimpleHTMLTagsFilter implements Filter {
private static final OLog log = Tracing.createLoggerFor(SimpleHTMLTagsFilter.class);
// match <p> <p/> <br> <br/>
private static final Pattern brAndPTagsPattern = Pattern.compile("<((br)|p|(BR)|P)( )*(/)?>");
// match </h1>..
private static final Pattern titleTagsPattern = Pattern.compile("</[hH][123456]>");
// match everything <....>
private static final Pattern stripHTMLTagsPattern = Pattern.compile("<(!|/)?\\w+((\\s+[\\w-]+(\\s*(=\\s*)?(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)/?>");
// match entities
private static final Pattern htmlSpacePattern = Pattern.compile("&nbsp;");
/**
* @see org.olat.core.util.filter.Filter#filter(java.lang.String)
*/
@Override
public String filter(String original) {
if(original == null) return null;
if(original.isEmpty()) return "";
try {
if (original == null) return null;
//some strange chars let to infinite loop in the regexp and need to be replaced
String modified = original.replaceAll("\u00a0", " ");
modified = brAndPTagsPattern.matcher(modified).replaceAll(" ");
modified = titleTagsPattern.matcher(modified).replaceAll(" ");
if (log.isDebug()) log.debug("trying to remove all html tags from: "+modified);
modified = stripHTMLTagsPattern.matcher(modified).replaceAll("");
modified = htmlSpacePattern.matcher(modified).replaceAll(" ");
return modified;
} catch (Throwable e) {
log.error("Could not filter HTML tags. Using unfiltered string! Original string was::" + original, e);
return original;
SAXParser parser = new SAXParser();
HTMLHandler contentHandler = new HTMLHandler((int)original.length());
parser.setContentHandler(contentHandler);
parser.parse(new InputSource(new StringReader(original)));
String text = contentHandler.toString();
text = text.replace('\u00a0', ' ');
text = StringHelper.escapeHtml(text);
return text;
} catch (SAXException e) {
log.error("", e);
return null;
} catch (IOException e) {
log.error("", e);
return null;
} catch (Exception e) {
log.error("", e);
return null;
}
}
/**
 * SAX content handler that collects the character data of an HTML document
 * into a size-limited writer and drops the markup itself.
 * <p>
 * Text inside <code>script</code> elements is skipped. A blank is written for
 * <code>li</code> and <code>br</code> start tags and for <code>li</code> and
 * <code>p</code> end tags. For the tags listed in
 * {@link NekoHTMLFilter#blockTags} a blank is only scheduled, and it is written
 * just before the next collected text if neither side already has one.
 */
private static class HTMLHandler extends DefaultHandler {
	/** False while the parser is inside a script element. */
	private boolean collect = true;
	/** True when a separating blank may have to be written before the next text. */
	private boolean pendingBlank = false;
	private final LimitedContentWriter content;

	/**
	 * @param size size hint handed to the underlying writer
	 */
	public HTMLHandler(int size) {
		content = new LimitedContentWriter(size, FileDocumentFactory.getMaxFileSize());
	}

	@Override
	public void startElement(String uri, String localName, String qName, Attributes attributes) {
		String elem = normalize(localName);
		if("script".equals(elem)) {
			collect = false;
		} else if("li".equals(elem) || "br".equals(elem)) {
			content.append(" ");
		} else if(NekoHTMLFilter.blockTags.contains(elem) && lacksTrailingBlank()) {
			// schedule a single blank before the text of a block element,
			// but only if the content does not already end with one
			pendingBlank = true;
		}
	}

	@Override
	public void characters(char[] chars, int offset, int length) {
		if(!collect) {
			return;
		}
		if(pendingBlank) {
			// write the scheduled blank unless the content already ends with
			// one or the incoming text starts with one
			if(lacksTrailingBlank() && length > 0 && chars[offset] != ' ') {
				content.append(' ');
			}
			pendingBlank = false;
		}
		content.write(chars, offset, length);
	}

	@Override
	public void endElement(String uri, String localName, String qName) {
		String elem = normalize(localName);
		if("script".equals(elem)) {
			collect = true;
		} else if("li".equals(elem) || "p".equals(elem)) {
			content.append(" ");
		} else if(NekoHTMLFilter.blockTags.contains(elem) && lacksTrailingBlank()) {
			pendingBlank = true;
		}
	}

	/**
	 * Lower-cases an element name independently of the default locale. With the
	 * plain toLowerCase() an upper-case "SCRIPT" or "LI" would become "scrıpt" /
	 * "lı" (dotless i) under a Turkish or Azeri default locale and silently miss
	 * every tag comparison in this handler.
	 */
	private static String normalize(String localName) {
		return localName.toLowerCase(java.util.Locale.ENGLISH);
	}

	/** @return true if some content was collected and its last character is not a blank */
	private boolean lacksTrailingBlank() {
		int len = content.length();
		return len > 0 && content.charAt(len - 1) != ' ';
	}

	@Override
	public String toString() {
		return content.toString();
	}
}
}
......@@ -97,7 +97,7 @@ public class FileDocumentFactory {
}
public static int getMaxFileSize() {
return (int)searchModule.getMaxFileSize();
return searchModule == null ? 120000 : (int)searchModule.getMaxFileSize();
}
public Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf)
......
......@@ -54,6 +54,12 @@ public class NekoHTMLFilterTest{
Assert.assertEquals(result, filter.filter(input));
}
/**
 * Runs the filter over a text containing the HTML entity {@code &auml;} and
 * prints the filtered result to standard output.
 */
// NOTE(review): this test contains no assertion, so it can only fail if
// filter() throws. Consider asserting the expected output once confirmed.
@Test
public void escaping() {
String output = filter.filter("Test &auml; test");
System.out.println(output);
}
@Test public void testPlainText() {
t(null, null);
t("", "");
......
......@@ -23,8 +23,7 @@ import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import org.olat.core.util.filter.Filter;
/**
* Description:<br>
......@@ -34,16 +33,17 @@ import org.junit.runners.JUnit4;
* Initial Date: 14.07.2009 <br>
* @author gnaegi
*/
@RunWith(JUnit4.class)
public class SimpleHTMLTagsFilterTest {
protected SimpleHTMLTagsFilter filter;
private Filter filter;
@Before public void setup() {
@Before
public void setup() {
filter = new SimpleHTMLTagsFilter();
}
@After public void tearDown() {
@After
public void tearDown() {
filter = null;
}
......@@ -51,15 +51,17 @@ public class SimpleHTMLTagsFilterTest {
Assert.assertEquals(result, filter.filter(input));
}
@Test public void testPlainText() {
@Test
public void testPlainText() {
t(null, null);
t("", "");
t("hello world", "hello world");
t("hello \n \t \r world", "hello \n \t \r world");
t("hello \n \t \r world", "hello \n \t \n world");
t("1+2=3", "1+2=3");
}
@Test public void testSimpleTags() {
@Test
public void testSimpleTags() {
t("<b>hello</b> world", "hello world");
t("<b><i>hello</i></b> world", "hello world");
t("<b>h<i>ell</i>o</b> world", "hello world");
......@@ -67,7 +69,8 @@ public class SimpleHTMLTagsFilterTest {
t("<a ref='#bla' \n title='gugus'>hello</b> world", "hello world");
}
@Test public void testBRAndPReplacement() {
@Test
public void testBRAndPReplacement() {
t("<br>", " ");
t("<p>", " ");
t("<br >", " ");
......@@ -78,7 +81,8 @@ public class SimpleHTMLTagsFilterTest {
t("<p />", " ");
}
@Test public void testTagsWithAttributes() {
@Test
public void testTagsWithAttributes() {
t("<font color='red'>hello</font> world", "hello world");
t("<font color=\"red\">hello</font> world", "hello world");
t("<a href=\"#top\" color='=>top'>go up</a>", "go up");
......@@ -89,7 +93,8 @@ public class SimpleHTMLTagsFilterTest {
// Boundary test: this filter does NOT decode HTML entities. Use the
// NekoHTMLFilter if you need this feature!
@Test public void testTagsWithEntities() {
@Test
public void testTagsWithEntities() {
t("Gn&auml;gi", "Gn&auml;gi");
t("This is &copy; by frentix", "This is &copy; by frentix");
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment