Skip to content
Snippets Groups Projects
Commit 07f6daa4 authored by srosse's avatar srosse
Browse files

OO-1031: relax a little bit the parser (patch S. Clemenz), implements some unit tests

parent d22715d4
No related branches found
No related tags found
No related merge requests found
......@@ -134,6 +134,8 @@ public class SimpleHtmlParser {
// check for doctype
int docTypePos = cont.indexOf("<!DOCTYPE");
if (docTypePos == -1) docTypePos = cont.indexOf("<doctype");
if (docTypePos == -1) docTypePos = cont.toLowerCase().indexOf("<doctype");
if (docTypePos != -1 ) {
int endOfhtmlDocTypePos = cont.indexOf(">", docTypePos);
htmlDocType = cont.substring(docTypePos, endOfhtmlDocTypePos+1);
......@@ -174,13 +176,11 @@ public class SimpleHtmlParser {
}
} else {
// no head tag found - use everything between HTML and BODY tag to support those crippled pages as well
htmlHead = cont.substring((cont.indexOf(">", spos))+1, bodypos);
htmlHead = cont.substring((cont.indexOf(">", spos))+1, bodypos).toLowerCase();
}
if (htmlHead != null) {
// Filter out base tag
int bsPos = htmlHead.indexOf("<base ");
if (bsPos == -1) bsPos = htmlHead.indexOf("<BASE ");
if (bsPos == -1) bsPos = htmlHead.toLowerCase().indexOf("<BASE ");
if (bsPos != -1) {
int bePos = htmlHead.indexOf('>', bsPos + 6);
if (bePos > -1) {
......@@ -193,7 +193,8 @@ public class SimpleHtmlParser {
// olat and firefox problem
htmlHead = filterHeader(htmlHead);
// Filter out CSS definitions from HEAD
if (htmlHead.indexOf("text/css") > 0) ownCss = true;
if (htmlHead.indexOf("text/css") > 0) ownCss = true; // required for HTML 4.01
else if (htmlHead.indexOf("stylesheet") > 0) ownCss = true; // "purely advisory" for HTML 5
// Filter out character set
charsetName = checkForCharset(htmlHead);
}
......@@ -388,31 +389,9 @@ public class SimpleHtmlParser {
return LINK_REL.matcher(tag).find(6);
}
/**
 * Concatenates every occurrence of {@code pattern} found in {@code in}.
 * Line terminators are stripped from the input (via
 * {@link #removeLineTerminators(String)}) before matching.
 *
 * @param in the text to scan; may be {@code null}
 * @param pattern the compiled pattern whose matches are collected
 * @return all matches appended back to back in order of appearance, an empty
 *         string when nothing matches, or {@code null} when {@code in} is {@code null}
 */
private String extractMatches(String in, Pattern pattern){
    // Guard clause: nothing to scan.
    if (in == null) {
        return null;
    }

    String singleLine = removeLineTerminators(in);
    Matcher matcher = pattern.matcher(singleLine);
    StringBuilder collected = new StringBuilder(128);
    while (matcher.find()) {
        String hit = matcher.group();
        collected.append(hit);
    }
    return collected.toString();
}
/**
 * Removes all carriage-return and line-feed characters from the given text,
 * so that Unix ({@code \n}), Windows ({@code \r\n}) and classic Mac
 * ({@code \r}) line endings are all stripped. Nothing is inserted in their
 * place: the surrounding characters are joined directly.
 *
 * @param in the text to clean; must not be {@code null}
 * @return {@code in} without any {@code \r} or {@code \n} characters
 * @throws NullPointerException if {@code in} is {@code null}
 */
public String removeLineTerminators(String in) {
    // The previous implementation only removed '\n', leaving stray '\r'
    // characters behind for Windows and Mac line endings.
    return in.replaceAll("[\\r\\n]", "");
}
}
/**
* @return Returns the htmlContent.
......
/**
* <a href="http://www.openolat.org">
* OpenOLAT - Online Learning and Training</a><br>
* <p>
* Licensed under the Apache License, Version 2.0 (the "License"); <br>
* you may not use this file except in compliance with the License.<br>
* You may obtain a copy of the License at the
* <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache homepage</a>
* <p>
* Unless required by applicable law or agreed to in writing,<br>
* software distributed under the License is distributed on an "AS IS" BASIS, <br>
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br>
* See the License for the specific language governing permissions and <br>
* limitations under the License.
* <p>
* Initial code contributed and copyrighted by<br>
* frentix GmbH, http://www.frentix.com
* <p>
*/
package org.olat.core.util;
import java.io.IOException;
import java.io.InputStream;
import junit.framework.Assert;
import org.apache.commons.io.IOUtils;
import org.junit.Test;
/**
*
* Initial date: 24.03.2014<br>
* @author srosse, stephane.rosse@frentix.com, http://www.frentix.com
*
*/
public class SimpleHtmlParserTest {

    /**
     * Encoding used to decode the HTML fixtures. Stated explicitly so the
     * tests do not depend on the platform default charset.
     */
    private static final String FIXTURE_ENCODING = "UTF-8";

    /**
     * Reads a classpath resource located next to this test class into a string
     * and always closes the underlying stream.
     *
     * @param resourceName name of the resource, resolved relative to this class
     * @return the complete content of the resource
     * @throws IOException if the resource cannot be read or closed
     */
    private static String loadFixture(String resourceName) throws IOException {
        InputStream inHtml = SimpleHtmlParserTest.class.getResourceAsStream(resourceName);
        // getResourceAsStream returns null for a missing resource; fail with a clear message.
        Assert.assertNotNull("Missing test resource: " + resourceName, inHtml);
        try {
            return IOUtils.toString(inHtml, FIXTURE_ENCODING);
        } finally {
            inHtml.close();
        }
    }

    /**
     * Parses the fixture {@code simple_1.html} and checks the body tag,
     * charset, content, doctype, XHTML namespace, validity and CSS flag.
     */
    @Test
    public void parse_minimalTinyMCEHTMLPage() throws IOException {
        String html = loadFixture("simple_1.html");
        SimpleHtmlParser parser = new SimpleHtmlParser(html);

        Assert.assertEquals("<body>", parser.getBodyTag());
        Assert.assertEquals("utf-8", parser.getCharsetName());
        Assert.assertEquals("Hello", parser.getHtmlContent().trim());
        Assert.assertEquals("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">", parser.getHtmlDocType());
        Assert.assertEquals("<html xmlns=\"http://www.w3.org/1999/xhtml\">", parser.getXhtmlNamespaces().trim());
        Assert.assertTrue(parser.isValidHtml());
        Assert.assertFalse(parser.hasOwnCss());
    }

    /**
     * Parses the fixture {@code simple_2.html} (an HTML 4.01 page without an
     * XHTML namespace) and checks the same set of parser properties.
     */
    @Test
    public void parse_externHtmlEditor() throws IOException {
        String html = loadFixture("simple_2.html");
        SimpleHtmlParser parser = new SimpleHtmlParser(html);

        Assert.assertEquals("<body>", parser.getBodyTag());
        Assert.assertEquals("iso-2022-jp", parser.getCharsetName());
        Assert.assertEquals("Generated", parser.getHtmlContent().trim());
        Assert.assertEquals("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"\n\t\"http://www.w3.org/TR/html4/loose.dtd\">", parser.getHtmlDocType());
        Assert.assertNull(parser.getXhtmlNamespaces());
        Assert.assertTrue(parser.isValidHtml());
        Assert.assertFalse(parser.hasOwnCss());
    }

    /**
     * Parses a bare page with neither doctype nor head and checks that the
     * optional properties are reported as {@code null} while the page is
     * still considered valid.
     */
    @Test
    public void parse_ErroHandling() throws IOException {
        String html = "<html><body></body></html>";
        SimpleHtmlParser parser = new SimpleHtmlParser(html);

        Assert.assertEquals("<body>", parser.getBodyTag());
        Assert.assertNull(parser.getCharsetName());
        Assert.assertEquals("", parser.getHtmlContent().trim());
        Assert.assertNull(parser.getHtmlDocType());
        Assert.assertNull(parser.getXhtmlNamespaces());
        Assert.assertTrue(parser.isValidHtml());
        Assert.assertFalse(parser.hasOwnCss());
    }
}
\ No newline at end of file
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="http://www.w3.org/1999/xhtml">
<head><meta name="generator" content="olat-tinymce-3" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title></title></head><body>
Hello
</body></html>
\ No newline at end of file
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<head>
<meta http-equiv="content-type" content="text/html; charset=iso-2022-jp">
<title>Untitled</title>
<meta name="generator" content="BBEdit 10.5">
</head>
<body>
Generated
</body>
</html>
\ No newline at end of file
......@@ -61,6 +61,7 @@ import org.junit.runners.Suite;
org.olat.core.util.StringHelperTest.class,
org.olat.core.util.FormatterTest.class,
org.olat.core.util.EncoderTest.class,
org.olat.core.util.SimpleHtmlParserTest.class,
org.olat.core.util.mail.manager.MailManagerTest.class,
org.olat.core.id.context.BusinessControlFactoryTest.class,
org.olat.core.id.context.HistoryManagerTest.class,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment