SimpleHtmlParser: detect the doctype declaration case-insensitively, flag heads
containing "stylesheet" as own CSS, strip \r as well as \n in
removeLineTerminators(), drop the unused extractMatches(); add
SimpleHtmlParserTest with two HTML fixtures and register it in AllTestsJunit4.

Note: whitespace of context lines was reconstructed by hand; use
`git apply --ignore-whitespace` if a hunk is rejected. The "index" lines of the
two files whose resulting content changed were dropped (hashes would be stale).

diff --git a/src/main/java/org/olat/core/util/SimpleHtmlParser.java b/src/main/java/org/olat/core/util/SimpleHtmlParser.java
--- a/src/main/java/org/olat/core/util/SimpleHtmlParser.java
+++ b/src/main/java/org/olat/core/util/SimpleHtmlParser.java
@@ -134,6 +134,8 @@ public class SimpleHtmlParser {
 
 		// check for doctype
 		int docTypePos = cont.indexOf("<!DOCTYPE");
+		if (docTypePos == -1) docTypePos = cont.indexOf("<!doctype");
+		if (docTypePos == -1) docTypePos = cont.toLowerCase().indexOf("<!doctype"); // mixed case, e.g. <!DocType
 		if (docTypePos != -1 ) {
 			int endOfhtmlDocTypePos = cont.indexOf(">", docTypePos);
 			htmlDocType = cont.substring(docTypePos, endOfhtmlDocTypePos+1);
@@ -174,13 +176,11 @@ public class SimpleHtmlParser {
 			}
 		} else {
 			// no head tag found - use everything between HTML and BODY tag to support those crippled pages as well
-			htmlHead = cont.substring((cont.indexOf(">", spos))+1, bodypos);
+			htmlHead = cont.substring((cont.indexOf(">", spos))+1, bodypos).toLowerCase();
 		}
 		if (htmlHead != null) {
 			// Filter out base tag
 			int bsPos = htmlHead.indexOf("<base ");
-			if (bsPos == -1) bsPos = htmlHead.indexOf("<BASE ");
-			if (bsPos == -1) bsPos = htmlHead.toLowerCase().indexOf("<BASE ");
 			if (bsPos != -1) {
 				int bePos = htmlHead.indexOf('>', bsPos + 6);
 				if (bePos > -1) {
@@ -193,7 +193,8 @@ public class SimpleHtmlParser {
 			// olat and firefox problem
 			htmlHead = filterHeader(htmlHead);
 			// Filter out CSS definitions from HEAD
-			if (htmlHead.indexOf("text/css") > 0) ownCss = true;
+			if (htmlHead.indexOf("text/css") > 0) ownCss = true; // required for HTML 4.01
+			else if (htmlHead.indexOf("stylesheet") > 0) ownCss = true; // "purely advisory" for HTML 5
 			// Filter out character set
 			charsetName = checkForCharset(htmlHead);
 		}
@@ -388,31 +389,9 @@ public class SimpleHtmlParser {
 		return LINK_REL.matcher(tag).find(6);
 	}
 
-	private String extractMatches(String in, Pattern pattern){
-		if (in == null) return null;
-		else {
-			StringBuilder out = new StringBuilder(128);
-			in = removeLineTerminators(in);
-			Matcher m = pattern.matcher(in);
-			while (m.find()) {
-				out.append(m.group());
-			}
-			return out.toString();
-		}
-	}
-
 	public String removeLineTerminators(String in) {
-//		String patternStr = "$^|[\\r\\n]+\\z";
-//		String replaceStr = " ";
-//		Pattern pattern = Pattern.compile(patternStr, Pattern.MULTILINE);
-//		Matcher matcher = pattern.matcher(inputStr);
-//		System.out.println(matcher.replaceAll(replaceStr));
-//		return matcher.replaceAll(replaceStr);
-		//the above does not work, grrr??? but it should remove all line terminators like the win and mac ones
-		return in.replaceAll("\\n", "");
-}
-
-
+		return in.replaceAll("[\\r\\n]+", ""); // strips \n, \r\n and \r, not only \n
+	}
 
 	/**
 	 * @return Returns the htmlContent.
diff --git a/src/test/java/org/olat/core/util/SimpleHtmlParserTest.java b/src/test/java/org/olat/core/util/SimpleHtmlParserTest.java
new file mode 100644
--- /dev/null
+++ b/src/test/java/org/olat/core/util/SimpleHtmlParserTest.java
@@ -0,0 +1,110 @@
+/**
+ * <a href="http://www.openolat.org">
+ * OpenOLAT - Online Learning and Training</a><br>
+ * <p>
+ * Licensed under the Apache License, Version 2.0 (the "License"); <br>
+ * you may not use this file except in compliance with the License.<br>
+ * You may obtain a copy of the License at the
+ * <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache homepage</a>
+ * <p>
+ * Unless required by applicable law or agreed to in writing,<br>
+ * software distributed under the License is distributed on an "AS IS" BASIS, <br>
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br>
+ * See the License for the specific language governing permissions and <br>
+ * limitations under the License.
+ * <p>
+ * Initial code contributed and copyrighted by<br>
+ * frentix GmbH, http://www.frentix.com
+ * <p>
+ */
+package org.olat.core.util;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Checks the body tag, charset, content, doctype, XHTML namespace, validity
+ * and own-CSS flag that {@link SimpleHtmlParser} reports for small HTML pages.
+ * <p>
+ * Initial date: 24.03.2014<br>
+ * @author srosse, stephane.rosse@frentix.com, http://www.frentix.com
+ *
+ */
+public class SimpleHtmlParserTest {
+
+	/** XHTML 1.0 page with a namespace declaration (fixture simple_1.html). */
+	@Test
+	public void parse_minimalTinyMCEHTMLPage() throws IOException {
+		String html = readFixture("simple_1.html");
+
+		SimpleHtmlParser parser = new SimpleHtmlParser(html);
+		Assert.assertEquals("<body>", parser.getBodyTag());
+		Assert.assertEquals("utf-8", parser.getCharsetName());
+		Assert.assertEquals("Hello", parser.getHtmlContent().trim());
+		Assert.assertEquals("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">", parser.getHtmlDocType());
+		Assert.assertEquals("<html xmlns=\"http://www.w3.org/1999/xhtml\">", parser.getXhtmlNamespaces().trim());
+		Assert.assertTrue(parser.isValidHtml());
+		Assert.assertFalse(parser.hasOwnCss());
+	}
+
+	/** HTML 4.01 page without XHTML namespace (fixture simple_2.html). */
+	@Test
+	public void parse_externHtmlEditor() throws IOException {
+		String html = readFixture("simple_2.html");
+
+		SimpleHtmlParser parser = new SimpleHtmlParser(html);
+		Assert.assertEquals("<body>", parser.getBodyTag());
+		Assert.assertEquals("iso-2022-jp", parser.getCharsetName());
+		Assert.assertEquals("Generated", parser.getHtmlContent().trim());
+		Assert.assertEquals("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"\n\t\"http://www.w3.org/TR/html4/loose.dtd\">", parser.getHtmlDocType());
+		Assert.assertNull(parser.getXhtmlNamespaces());
+		Assert.assertTrue(parser.isValidHtml());
+		Assert.assertFalse(parser.hasOwnCss());
+	}
+
+	/** Page without doctype, head or charset: those getters return null, the page is still valid. */
+	@Test
+	public void parse_ErroHandling() throws IOException {
+		String html = "<html><body></body></html>";
+
+		SimpleHtmlParser parser = new SimpleHtmlParser(html);
+		Assert.assertEquals("<body>", parser.getBodyTag());
+		Assert.assertNull(parser.getCharsetName());
+		Assert.assertEquals("", parser.getHtmlContent().trim());
+		Assert.assertNull(parser.getHtmlDocType());
+		Assert.assertNull(parser.getXhtmlNamespaces());
+		Assert.assertTrue(parser.isValidHtml());
+		Assert.assertFalse(parser.hasOwnCss());
+	}
+
+	/** A lower-case doctype declaration must be detected as well. */
+	@Test
+	public void parse_lowerCaseDoctype() throws IOException {
+		String html = "<!doctype html><html><head><title></title></head><body>Hello</body></html>";
+
+		SimpleHtmlParser parser = new SimpleHtmlParser(html);
+		Assert.assertEquals("<!doctype html>", parser.getHtmlDocType());
+	}
+
+	/**
+	 * Reads a fixture stored next to this class.
+	 * Decodes as UTF-8 instead of the platform charset and always closes the stream.
+	 *
+	 * @param filename resource name, resolved relative to this class
+	 * @return the text of the resource
+	 * @throws IOException if reading fails
+	 */
+	private String readFixture(String filename) throws IOException {
+		InputStream inHtml = SimpleHtmlParserTest.class.getResourceAsStream(filename);
+		Assert.assertNotNull("Missing test resource: " + filename, inHtml);
+		try {
+			return IOUtils.toString(inHtml, "UTF-8");
+		} finally {
+			inHtml.close();
+		}
+	}
+}
\ No newline at end of file
diff --git a/src/test/java/org/olat/core/util/simple_1.html b/src/test/java/org/olat/core/util/simple_1.html
new file mode 100644
index 0000000000000000000000000000000000000000..dd4b171b7175014089fdb7c945dcf8a70b927695
--- /dev/null
+++ b/src/test/java/org/olat/core/util/simple_1.html
@@ -0,0 +1,7 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="http://www.w3.org/1999/xhtml">
+<head><meta name="generator" content="olat-tinymce-3" />
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<title></title></head><body>
+Hello
+</body></html>
\ No newline at end of file
diff --git a/src/test/java/org/olat/core/util/simple_2.html b/src/test/java/org/olat/core/util/simple_2.html
new file mode 100644
index 0000000000000000000000000000000000000000..4652f4e84cafc2868ee4e642ed5c8a73b5a2d636
--- /dev/null
+++ b/src/test/java/org/olat/core/util/simple_2.html
@@ -0,0 +1,12 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
+	"http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+	<meta http-equiv="content-type" content="text/html; charset=iso-2022-jp">
+	<title>Untitled</title>
+	<meta name="generator" content="BBEdit 10.5">
+</head>
+<body>
+Generated
+</body>
+</html>
\ No newline at end of file
diff --git a/src/test/java/org/olat/test/AllTestsJunit4.java b/src/test/java/org/olat/test/AllTestsJunit4.java
index 6fbee4f9992927fe239d921d099ffead4d83238e..703a632e42c6379cbdb879862be8de9886b2a6b1 100644
--- a/src/test/java/org/olat/test/AllTestsJunit4.java
+++ b/src/test/java/org/olat/test/AllTestsJunit4.java
@@ -61,6 +61,7 @@ import org.junit.runners.Suite;
 	org.olat.core.util.StringHelperTest.class,
 	org.olat.core.util.FormatterTest.class,
 	org.olat.core.util.EncoderTest.class,
+	org.olat.core.util.SimpleHtmlParserTest.class,
 	org.olat.core.util.mail.manager.MailManagerTest.class,
 	org.olat.core.id.context.BusinessControlFactoryTest.class,
 	org.olat.core.id.context.HistoryManagerTest.class,