From d4f3d7cdb9503bde98abc898577b4a95c56322af Mon Sep 17 00:00:00 2001 From: gnaegi <none@none> Date: Fri, 8 Apr 2016 11:38:00 +0200 Subject: [PATCH] OO-1976 implement indexing of subpages in single pages, improve link parsing in html pages, adding testcases --- .../org/olat/course/nodes/SPCourseNode.java | 37 ----- .../search/service/indexer/LeafIndexer.java | 8 +- .../course/SPCourseNodeIndexer.java | 150 +++++++++++++----- .../course/SPCourseNodeIndexerTest.java | 75 +++++++++ .../java/org/olat/test/AllTestsJunit4.java | 1 + 5 files changed, 193 insertions(+), 78 deletions(-) create mode 100644 src/test/java/org/olat/search/service/indexer/repository/course/SPCourseNodeIndexerTest.java diff --git a/src/main/java/org/olat/course/nodes/SPCourseNode.java b/src/main/java/org/olat/course/nodes/SPCourseNode.java index dcdc6af3266..a087b7d5e16 100644 --- a/src/main/java/org/olat/course/nodes/SPCourseNode.java +++ b/src/main/java/org/olat/course/nodes/SPCourseNode.java @@ -27,8 +27,6 @@ package org.olat.course.nodes; import java.util.List; -import org.olat.core.commons.modules.bc.vfs.OlatNamedContainerImpl; -import org.olat.core.commons.modules.bc.vfs.OlatRootFolderImpl; import org.olat.core.gui.UserRequest; import org.olat.core.gui.components.stack.BreadcrumbPanel; import org.olat.core.gui.control.Controller; @@ -47,7 +45,6 @@ import org.olat.course.editor.StatusDescription; import org.olat.course.nodes.sp.SPEditController; import org.olat.course.nodes.sp.SPPeekviewController; import org.olat.course.nodes.sp.SPRunController; -import org.olat.course.run.environment.CourseEnvironment; import org.olat.course.run.navigation.NodeRunConstructionResult; import org.olat.course.run.userview.NodeEvaluation; import org.olat.course.run.userview.UserCourseEnvironment; @@ -232,38 +229,4 @@ public class SPCourseNode extends AbstractAccessableCourseNode { //there was a version 3 but all keys new in this version have been removed } } - - // Copy from BCCourseNode => Merge together - // /////////////////////// - /** - * @param courseEnv - * @param node - * @return the relative folder base path for this folder node - */ - // public static String getFoldernodePathRelToFolderBase(CourseEnvironment - // courseEnv, CourseNode node) { - // return getFoldernodesPathRelToFolderBase(courseEnv) + "/" + - // node.getIdent(); - // } - /** - * @param courseEnv - * @return the relative folder base path for folder nodes - */ - public static String getFoldernodesPathRelToFolderBase(CourseEnvironment courseEnv) { - return courseEnv.getCourseBaseContainer().getRelPath() + "/coursefolder"; - } - - /** - * Get a named container of a node with the node title as its name. - * - * @param node - * @param courseEnv - * @return - */ - public static OlatNamedContainerImpl getNodeFolderContainer(SPCourseNode node, CourseEnvironment courseEnv) { - String path = getFoldernodesPathRelToFolderBase(courseEnv); - OlatRootFolderImpl rootFolder = new OlatRootFolderImpl(path, null); - OlatNamedContainerImpl namedFolder = new OlatNamedContainerImpl(node.getShortTitle(), rootFolder); - return namedFolder; - } } diff --git a/src/main/java/org/olat/search/service/indexer/LeafIndexer.java b/src/main/java/org/olat/search/service/indexer/LeafIndexer.java index ce1ca25ecf4..56582d95265 100644 --- a/src/main/java/org/olat/search/service/indexer/LeafIndexer.java +++ b/src/main/java/org/olat/search/service/indexer/LeafIndexer.java @@ -30,6 +30,7 @@ import java.io.IOException; import org.apache.lucene.document.Document; import org.olat.core.CoreSpringFactory; +import org.olat.core.commons.modules.bc.vfs.OlatRootFolderImpl; import org.olat.core.util.WorkThreadInformations; import org.olat.core.util.vfs.VFSContainer; import org.olat.core.util.vfs.VFSLeaf; @@ -82,7 +83,12 @@ public abstract class LeafIndexer extends AbstractHierarchicalIndexer { String path = ""; VFSContainer parentContainer = leaf.getParentContainer(); while (parentContainer.getParentContainer() != null) { - path = parentContainer.getName() + "/" + path; + String name = parentContainer.getName(); + if (parentContainer instanceof OlatRootFolderImpl && name.equals("coursefolder")) { + // don't add the coursefolder to the path, the path is relative to the course folder + break; + } + path = name + "/" + path; parentContainer = parentContainer.getParentContainer(); } return path; diff --git a/src/main/java/org/olat/search/service/indexer/repository/course/SPCourseNodeIndexer.java b/src/main/java/org/olat/search/service/indexer/repository/course/SPCourseNodeIndexer.java index 14d11759a13..624ef12c181 100644 --- a/src/main/java/org/olat/search/service/indexer/repository/course/SPCourseNodeIndexer.java +++ b/src/main/java/org/olat/search/service/indexer/repository/course/SPCourseNodeIndexer.java @@ -40,9 +40,9 @@ import org.olat.core.util.FileUtils; import org.olat.core.util.vfs.VFSContainer; import org.olat.core.util.vfs.VFSItem; import org.olat.core.util.vfs.VFSLeaf; +import org.olat.core.util.vfs.VFSManager; import org.olat.course.ICourse; import org.olat.course.nodes.CourseNode; -import org.olat.course.nodes.SPCourseNode; import org.olat.course.nodes.sp.SPEditController; import org.olat.search.service.SearchResourceContext; import org.olat.search.service.indexer.LeafIndexer; @@ -62,7 +62,7 @@ public class SPCourseNodeIndexer extends LeafIndexer implements CourseNodeIndexe private final static String SUPPORTED_TYPE_NAME = "org.olat.course.nodes.SPCourseNode"; private final static boolean indexOnlyChosenFile = false; - private static final Pattern HREF_PATTERN = Pattern.compile("href=\\\"([^\\\"]*)\\\"", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE); + private static final Pattern HREF_PATTERN = Pattern.compile("href=\\\"(?!http:\\/\\/|https:\\/\\/|javascript:|mailto:|tel:|\\/|:|#|\\.\\.)([^\\\"]*)\\\"", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE); private static final String HTML_SUFFIXES = "html htm xhtml xml"; @Override @@ -75,19 +75,69 @@ public class SPCourseNodeIndexer extends LeafIndexer implements CourseNodeIndexe courseNodeResourceContext.setTitle(courseNode.getShortTitle()); courseNodeResourceContext.setDescription(courseNode.getLongTitle()); - VFSContainer rootContainer = SPCourseNode.getNodeFolderContainer((SPCourseNode) courseNode, course.getCourseEnvironment()); - String chosenFile = (String) courseNode.getModuleConfiguration().get(SPEditController.CONFIG_KEY_FILE); - // First: Index choosen HTML file - if (log.isDebug()) log.debug("Index chosen file in SP. chosenFile=" + chosenFile); + // The root of the configured single page. Depends on the configuration + // whether to follow relative links or not. When relative links are + // followed, the root is the course folder root, if not, it is folder + // where the configured file is in + VFSContainer rootContainer; + // The filename of the configured file relative to the rootContainer + String chosenFile; + + // Read the course node configuration + VFSContainer courseFolderContainer = course.getCourseEnvironment().getCourseFolderContainer(); +// String path = course.getCourseEnvironment().getCourseBaseContainer().getRelPath() + "/coursefolder"; +// VFSContainer courseFolderContainer = new OlatRootFolderImpl(path, null); + + + boolean allowRelativeLinks = courseNode.getModuleConfiguration().getBooleanSafe(SPEditController.CONFIG_KEY_ALLOW_RELATIVE_LINKS); + String fileName = (String) courseNode.getModuleConfiguration().get(SPEditController.CONFIG_KEY_FILE); + + // *** IF YOU CHANGE THIS LOGIC, do also change it in SinglePageController! *** + if (allowRelativeLinks) { + // Case 1: relative links are allowed. The root is the root of the + // course, the file name is relative to the root + rootContainer = courseFolderContainer; + chosenFile = fileName; + } else { + // Csae 2: relative links are NOT allowed. We have to calculate the + // new root and remove the relative path to the course folder form + // the file. + String startURI = ( (fileName.charAt(0) == '/')? fileName.substring(1) : fileName); + int sla = startURI.lastIndexOf('/'); + if (sla != -1) { + // Some subfolder path is detected, create basecontainer from it + String root = startURI.substring(0,sla); + startURI = startURI.substring(sla+1); + // Create new root folder from the relative folder path + VFSContainer newroot = (VFSContainer)courseFolderContainer.resolve(root); + newroot.setParentContainer(null); + rootContainer = newroot; + } else { + // No subpath detected, just use course base container + rootContainer = courseFolderContainer; + } + chosenFile = startURI; + } + + // First: Index configured HTML file + if (log.isDebug()) { + log.debug("-------------------- Indexing course node::" + courseNode.getIdent() + " " + courseNode.getShortName()); + log.debug("Config: allow relative links::" + allowRelativeLinks); + log.debug("Config: filename::" + fileName); + log.debug("Base dir::" + VFSManager.getRealPath(rootContainer)); + log.debug("chosenFile::" + chosenFile); + } VFSLeaf leaf = (VFSLeaf)rootContainer.resolve(chosenFile); if (leaf != null) { String filePath = getPathFor(leaf); if (log.isDebug()) log.debug("Found chosen file in SP. filePath=" + filePath ); + // Use inherited method from LeafIndexer for the actual indexing of the content doIndexVFSLeafByMySelf(courseNodeResourceContext, leaf, indexWriter, filePath); if (!indexOnlyChosenFile) { if (log.isDebug()) log.debug("Index sub pages in SP."); Set<String> alreadyIndexFileNames = new HashSet<String>(); alreadyIndexFileNames.add(chosenFile); + // Check if page has links to subpages and index those as well indexSubPages(courseNodeResourceContext,rootContainer,indexWriter,leaf,alreadyIndexFileNames,0,filePath); } else if (log.isDebug()) { log.debug("Index only chosen file in SP."); @@ -101,38 +151,49 @@ public class SPCourseNodeIndexer extends LeafIndexer implements CourseNodeIndexe return SUPPORTED_TYPE_NAME; } - private void indexSubPages(SearchResourceContext courseNodeResourceContext, VFSContainer rootContainer, OlatFullIndexer indexWriter, VFSLeaf leaf, Set<String> alreadyIndexFileNames, int subPageLevel, String rootFilePath) throws IOException,InterruptedException { + private void indexSubPages(SearchResourceContext courseNodeResourceContext, VFSContainer rootContainer, + OlatFullIndexer indexWriter, VFSLeaf leaf, Set<String> alreadyIndexFileNames, int subPageLevel, + String rootFilePath) throws IOException, InterruptedException { int mySubPageLevel = subPageLevel; - // check deepness of recursion - if (mySubPageLevel++ <= 5) { + // check deepness of recursion + if (mySubPageLevel++ <= 5) { List<String> links = getLinkListFrom(leaf); for (String link : links) { - if (log.isDebug()) log.debug("link=" + link); - if (!alreadyIndexFileNames.contains(link)) { - if ( (rootFilePath != null) && !rootFilePath.equals("")) { - if (rootFilePath.endsWith("/")) { - link = rootFilePath + link; - } else { - link = rootFilePath + "/" + link; - } + if (log.isDebug()) + log.debug("link=" + link); + if ((rootFilePath != null) && !rootFilePath.equals("")) { + if (rootFilePath.endsWith("/")) { + link = rootFilePath + link; + } else { + link = rootFilePath + "/" + link; } + } + if (!alreadyIndexFileNames.contains(link)) { VFSItem item = rootContainer.resolve(link); - if ( (item != null) && (item instanceof VFSLeaf) ) { - VFSLeaf subPageLeaf = (VFSLeaf)item; - if (log.isDebug()) log.debug("subPageLeaf=" + subPageLeaf); - String filePath = getPathFor(subPageLeaf); - doIndexVFSLeafByMySelf(courseNodeResourceContext, subPageLeaf, indexWriter, filePath); - alreadyIndexFileNames.add(subPageLeaf.getName()); - indexSubPages(courseNodeResourceContext,rootContainer,indexWriter,subPageLeaf,alreadyIndexFileNames,mySubPageLevel,rootFilePath); - } else { - if (log.isDebug()) log.debug("Could not found sub-page for link=" + link); - } + if ((item != null) && (item instanceof VFSLeaf)) { + VFSLeaf subPageLeaf = (VFSLeaf) item; + if (log.isDebug()) + log.debug("subPageLeaf=" + subPageLeaf); + String filePath = getPathFor(subPageLeaf); + + String newRootFilePath = filePath; + + doIndexVFSLeafByMySelf(courseNodeResourceContext, subPageLeaf, indexWriter, filePath); + alreadyIndexFileNames.add(link); + + indexSubPages(courseNodeResourceContext, rootContainer, indexWriter, subPageLeaf, alreadyIndexFileNames, mySubPageLevel, newRootFilePath); + } else { + if (log.isDebug()) + log.debug("Could not found sub-page for link=" + link); + } } else { - if (log.isDebug()) log.debug("sub-page already indexed, link=" + link); + if (log.isDebug()) + log.debug("sub-page already indexed, link=" + link); } } } else { - if (log.isDebug()) log.debug("Reach to many sub-page levels. Go not further with indexing sub-pages last leaf=" + leaf.getName()); + if (log.isDebug()) + log.debug("Reach to many sub-page levels. Go not further with indexing sub-pages last leaf=" + leaf.getName()); } } @@ -143,21 +204,30 @@ public class SPCourseNodeIndexer extends LeafIndexer implements CourseNodeIndexe if (HTML_SUFFIXES.contains(suffix)) { BufferedInputStream bis = new BufferedInputStream(leaf.getInputStream()); String inputString = FileUtils.load(bis, "utf-8"); - // Remove all HTML Tags - Matcher m = HREF_PATTERN.matcher(inputString); + // Remove all HTML Tags + if (log.isDebug()) log.debug(inputString); + extractSubpageLinks(inputString, linkList); + } + return linkList; + } + + /** + * Extract links to subpages from given page content + * @param pageContent HTML content + * @param linkList found links are added to this list + */ + public static void extractSubpageLinks(String pageContent, List<String> linkList) { + Matcher m = HREF_PATTERN.matcher(pageContent); String match; - while (m.find()) { - int groupCount = m.groupCount(); - if (groupCount > 0) { - match = m.group(1); // e.g. 'seite2.html' - if (!match.startsWith("http://")) { // TODO: Filter other url than http - linkList.add(match); - } - } + while (m.find()) { + int groupCount = m.groupCount(); + if (groupCount > 0) { + match = m.group(1); // e.g. 'seite2.html' + linkList.add(match); } } - return linkList; } + private String getSuffix(String fileName) { int dotpos = fileName.lastIndexOf('.'); diff --git a/src/test/java/org/olat/search/service/indexer/repository/course/SPCourseNodeIndexerTest.java b/src/test/java/org/olat/search/service/indexer/repository/course/SPCourseNodeIndexerTest.java new file mode 100644 index 00000000000..ee031c5d32a --- /dev/null +++ b/src/test/java/org/olat/search/service/indexer/repository/course/SPCourseNodeIndexerTest.java @@ -0,0 +1,75 @@ +/** + * <a href="http://www.openolat.org"> + * OpenOLAT - Online Learning and Training</a><br> + * <p> + * Licensed under the Apache License, Version 2.0 (the "License"); <br> + * you may not use this file except in compliance with the License.<br> + * You may obtain a copy of the License at the + * <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache homepage</a> + * <p> + * Unless required by applicable law or agreed to in writing,<br> + * software distributed under the License is distributed on an "AS IS" BASIS, <br> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br> + * See the License for the specific language governing permissions and <br> + * limitations under the License. + * <p> + * Initial code contributed and copyrighted by<br> + * frentix GmbH, http://www.frentix.com + * <p> + */ +package org.olat.search.service.indexer.repository.course; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.Assert; +import org.junit.Test; + +/** + * Test the regexp to find links to subpages. + * + * @author Florian Gnägi, gnaegi@frentix.com, http://www.frentix.com + */ +public class SPCourseNodeIndexerTest { + @Test + public void testFindLinksRegexp() { + //VALID cases + // direct + Assert.assertEquals(findLink("<html><body><h1>asdf</h1>asdfkasdf <a href=\"gugs.html\" target=\"_blank\">yo man </a></body></html>"), "gugs.html"); + // subdirectory + Assert.assertEquals(findLink("<html><body><h1>asdf</h1>asdfkasdf <a href=\"blabler/gugs.html\" target=\"_blank\">yo man </a></body></html>"), "blabler/gugs.html"); + Assert.assertEquals(findLink("<html><body><h1>asdf</h1>asdfkasdf <a href=\"gruebel/blabler/gugs.html\" target=\"_blank\">yo man </a></body></html>"), "gruebel/blabler/gugs.html"); + Assert.assertEquals(findLink("<html><body><h1>asdf</h1>asdfkasdf <a href=\"video-2/index.html\" class=\"s_goto_link s_goto_video\">yo man </a></body></html>"), "video-2/index.html"); + // with valid keywords + + // INVALID cases + // absolute links + Assert.assertNull(findLink("<html><body><h1>asdf</h1>asdfkasdf <a href=\"/gugs.html\" target=\"_blank\">yo man </a></body></html>")); + Assert.assertNull(findLink("<html><body><h1>asdf</h1>asdfkasdf <a href=\"://gugs.html\" target=\"_blank\">yo man </a></body></html>")); + Assert.assertNull(findLink("<html><body><h1>asdf</h1>asdfkasdf <a href=\"http://www.openolat.org/gugs.html\" target=\"_blank\">yo man </a></body></html>")); + Assert.assertNull(findLink("<html><body><h1>asdf</h1>asdfkasdf <a href=\"https://www.openolat.org/gugs.html\" target=\"_blank\">yo man </a></body></html>")); + // relative links + Assert.assertNull(findLink("<html><body><h1>asdf</h1>asdfkasdf <a href=\"../gugs.html\" target=\"_blank\">yo man </a></body></html>")); + // selfreference + Assert.assertNull(findLink("<html><body><h1>asdf</h1>asdfkasdf <a href=\"#blub\" target=\"_blank\">yo man </a></body></html>")); + // other protocol handlers + Assert.assertNull(findLink("<html><body><h1>asdf</h1>asdfkasdf <a href=\"javascript:(void();)\" target=\"_blank\">yo man </a></body></html>")); + Assert.assertNull(findLink("<html><body><h1>asdf</h1>asdfkasdf <a href=\"mailto:info@openolat.org\" target=\"_blank\">yo man </a></body></html>")); + Assert.assertNull(findLink("<html><body><h1>asdf</h1>asdfkasdf <a href=\"tel:0435449000\" target=\"_blank\">yo man </a></body></html>")); + } + + /** + * Helper to make it simpler to call in testcase + * @param page + * @return + */ + private String findLink(String page) { + List<String> linkList = new ArrayList<String>(); + SPCourseNodeIndexer.extractSubpageLinks(page, linkList); + if (linkList.isEmpty()) { + return null; + } else { + return linkList.get(0); + } + } +} diff --git a/src/test/java/org/olat/test/AllTestsJunit4.java b/src/test/java/org/olat/test/AllTestsJunit4.java index 8eb6d093e18..dced4d98c41 100644 --- a/src/test/java/org/olat/test/AllTestsJunit4.java +++ b/src/test/java/org/olat/test/AllTestsJunit4.java @@ -156,6 +156,7 @@ import org.junit.runners.Suite; org.olat.modules.reminder.manager.ReminderRuleEngineTest.class, org.olat.properties.PropertyTest.class, org.olat.search.service.document.file.FileDocumentFactoryTest.class, + org.olat.search.service.indexer.repository.course.SPCourseNodeIndexerTest.class, org.olat.search.service.document.file.PDFDocumentTest.class, org.olat.search.service.document.file.OfficeDocumentTest.class, org.olat.core.commons.services.notifications.manager.NotificationsManagerTest.class, -- GitLab