% !TEX encoding = UTF-8 Unicode
\documentclass[10pt,parskip=half]{scrartcl}

\usepackage[utf8]{inputenc}
\usepackage[english]{babel}
\usepackage{amssymb,amsmath}
\usepackage{fullpage}
\usepackage{graphicx}
\usepackage{epstopdf}
\usepackage{xspace}
\usepackage{lmodern}
\usepackage{textcomp}
\usepackage{bm}
\usepackage{layouts}
\usepackage{enumitem}
\usepackage{multirow}
\usepackage[table,xcdraw]{xcolor}
\usepackage[percent]{overpic}
\usepackage{tikz}
\usepgflibrary{arrows}% for more options on arrows
\usepackage[colorinlistoftodos]{todonotes}
\usepackage{color,soul}
\usepackage{booktabs}
% todonotes already loaded above with [colorinlistoftodos]
\usepackage[nottoc]{tocbibind} %to make the references appear in the ToC
\usepackage{subcaption}
\usepackage{hyperref} % For clickable references/links
\graphicspath{{./img/}}
\newcommand{\Kristina}[1]{\textcolor{violet}{\textbf{\textit{<#1>}}}}	% to mark ideas for the text
\newcommand{\Danielle}[1]{\textcolor{blue}{\textbf{\textit{<#1>}}}}
\newcommand{\Jamie}[1]{\textcolor{green}{\textbf{\textit{<#1>}}}}
\newcommand{\Philipp}[1]{\textcolor{purple}{\textbf{\textit{<#1>}}}}
\newcommand{\Valerian}[1]{\textcolor{orange}{\textbf{\textit{<#1>}}}}


\usepackage{lipsum}

\def\bf{\bfseries}

% start: code settings
\usepackage{listings}
% xcolor already loaded above with [table,xcdraw]

\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}

\lstdefinestyle{mystyle}{
    backgroundcolor=\color{backcolour},   
    commentstyle=\color{codegreen},
    keywordstyle=\color{magenta},
    numberstyle=\tiny\color{codegray},
    stringstyle=\color{codepurple},
    basicstyle=\ttfamily\footnotesize,
    breakatwhitespace=false,         
    breaklines=true,                 
    captionpos=b,                    
    keepspaces=true,                 
    numbers=left,                    
    numbersep=5pt,                  
    showspaces=false,                
    showstringspaces=false,
    showtabs=false,                  
    tabsize=1
}

\lstset{style=mystyle}

% end: code settings


\begin{document}

%----------------------------------------------------------------------------------------
% Title page
%----------------------------------------------------------------------------------------
	
	\begin{center}\Large
		\includegraphics[width=5cm]{uibk_logo_4c_cmyk.pdf}\\[5mm]
		
		\begin{large}\bfseries
			PS 703301 -- WS 2021/22\\
			Current Topics in Computer Science\\[5mm]
		\end{large}
		Final Report\\[20mm]
		{\titlefont \huge Shaping Knowledge Graphs}\\[20mm]
		 
		Philipp Gritsch \\
		Jamie Hochrainer \\
		Kristina Magnussen \\
		Danielle McKenney\\
		Valerian Wintner\\[35mm]
		
		supervised by\\
		M.Sc.~Elwin Huaman\\
		\vfill
	\end{center}
\thispagestyle{empty}	
\pagebreak
% ------------------------------------------------------------------------
\tableofcontents
\pagebreak
% ------------------------------------------------------------------------
\todo{remove this part} 
I added a comment colour for everyone. 
\Jamie{Comment colour Jamie}
\Danielle{Comment colour Danielle}
\Philipp{Comment colour Philipp}
\Valerian{Comment colour Valerian}
\Kristina{Comment colour Kristina}



\section{Introduction}
\label{introduction}
\todo{add introduction to KGs}
%Our task was to implement a framework for shaping \emph{knowledge graphs}. This consisted of three major steps. First of all, we had to fetch a subset of data using \emph{SPARQL} queries, see Section~\ref{fetchingdata}. After this, we had to infer constraints over this data set (see Section~\ref{generatingconstraints}). These were validated automatically in the last step, see Section~\ref{validatingconstraints}. In addition, we also implemented a front-end so that a user could interact with the given framework. \\


We used \emph{CommonCrawl} \Jamie{maybe too specific as first sentence in introduction} datasets as the base for the \emph{knowledge graph} which we wanted to assess. The data contained in those datasets is often inconsistent and might contain errors. In order to work with this data properly, it is necessary to shape the \emph{knowledge graph} in which this data is contained. This shaping is done by inferring constraints over the data and validating it based on these constraints. Validating a graph against constraints gives important insight into the structure of the data. For instance, when all nodes of a type conform to constraints, then it may be useful to define these as required attributes for all future nodes to ensure uniformity in the data. Non-conforming nodes may also deliver important insight into where information is missing. For example, if 99\% of nodes of a given type conform to some constraints, it may be worthwhile to investigate the remaining 1\% to see if they are missing necessary information or otherwise corrupt. \Jamie{Introduction should also contain what we cover in the report (not only motivation) with referring to Sections to give a short overview.}

\section{Related Work}
\todo{Add thesis Werkmeister + RDF2Graph, also add another work, maybe from sources in thesis, done by Philipp}

\section{Approach}
%You may add any subsections you deem appropriate for your specific project. Some examples for your reference: Technology stack, Training strategy, Data, Experiments, etc.
Our framework \Jamie{do you mean web application? I thought framework is sth different} offers a way to evaluate a \emph{knowledge graph} in an automated way. For this, we used \emph{knowledge graphs} from the \emph{CommonCrawl} datasets as a basis. The \emph{knowledge graphs} are imported as a static file. After this, our framework infers constraints over this data set (see Section~\ref{generatingconstraints}). These are validated automatically in the last step, see Section~\ref{validatingconstraints}. The user can interact with this framework over the front-end, see Section~\ref{frontend}. These different steps were implemented and tested separately. Once this was done, we consolidated them. The structure of our project can be seen in Fig.~\ref{fig:uml}. \todo{update figure}

\begin{figure}[ht]
	\centering
	\includegraphics[scale=0.35]{kg_shapes_uml.pdf}
	\caption{UML diagram of the framework structure}
	\label{fig:uml}
\end{figure}
\todo{refer to the readme here? Or should this happen somewhere else?}
\todo{add reference to our github repo!}

\subsection{Technology Stack}
\Jamie{In general, it would be nice to have an introductory sentence at the beginning of each section}
The framework was implemented in \emph{Java}. We used \emph{Maven} as a project management tool. We also used \emph{Jena}, which offers an \emph{RDF} API as well as support for \Jamie{supports} \emph{SPARQL} queries and the \emph{ShEx} language. The front-end was implemented using \emph{Vue3}~\cite{Vue} as a front-end framework and \emph{PrimeVue} as a library for the different \Jamie{UI} components. For the deployment of our application we use a single virtual machine. Access to the front-end is done via a single \emph{Apache} server. The front-end accesses the back-end via an internal \emph{REST-API}.
\subsection{Generating Constraints}
\label{generatingconstraints}
For the generation of constraints, we used the tool \emph{RDF2Graph} \cite{original_rdf2graph_git} and adapted it for our purposes. As input, \emph{RDF2Graph} takes a \emph{knowledge graph} from \emph{CommonCrawl}. The properties of the graph are read out with several \emph{SPARQL} queries. These properties are saved in a new \emph{RDF} graph. As output, we receive a graph containing constraints for the initial input data. We use \emph{RDF2Graph} queries to extract the constraints in \emph{ShEx} syntax.  
\missingfigure{ add query to graph (chosen by Philipp), e.g. multiplicy of argument etc.}

\subsubsection{Integrating RDF2Graph with our framework}
We implemented the following steps in order to integrate \emph{RDF2Graph} into our project. We added \emph{RDF2graph} to our framework so that they could be compiled together. In addition, we changed some of the initial parameters of the \emph{RDF2Graph}, since it originally was intended as a stand-alone application. As we are handling \emph{Models} \todo{Add explanation for Model? Maybe in glossary?} in our software, we changed the input to \emph{RDF2Graph} to a \emph{Model}. In our application, \emph{RDF2Graph} does not use any other storage apart from the \emph{Model} data structure. Previously, such a Model needed to be created by \emph{RDF2Graph}, now it is provided by our framework. We did this so we could have full control over the files handled by \emph{RDF2Graph}. \emph{RDF2Graph} allows for multithreaded execution, which requires a thread pool. This thread pool was initially created by \emph{RDF2Graph}. In our framework, it is provided by our application. In addition, resources which are used by \emph{RDF2Graph} had to be provided in a different way so that they are still available when running from a server environment. We also changed some of the queries. \emph{RDF2Graph} supports multiple output graphs, however, this did not work \todo{should we explain this in more detail?}. As we only work on one Model at a time, we only use one output graph.
\todo{Add explanation of limit to this section?}
\subsection{Validating Constraints}
\label{validatingconstraints}
Given an \emph{RDF} graph and a set of constraints, the validation consists of verifying that every node in the graph fulfils the requirements given in the constraints. A graph consists of several different types. Each of those types must conform to its definition outlined in the constraints. The result of the validation is a boolean flag for every single node in the graph, indicating whether or not it conforms to its type's constraints. In case of nonconformity, a reason will be given.

In our code, this is implemented in the following way. As input, we receive an \emph{RDF} subgraph as well as a set of constraints. We use this to generate a \emph{shape map}, which contains all of the types which need to be validated. For the actual validation, the \emph{ShExValidator} provided by the \emph{Jena} library was used. \todo{add reference to Jena library here?} The validator requires a set of constraints defined in valid \emph{ShEx} syntax and a \emph{shape map}. The \emph{shape map} describes which types of nodes need to be validated against which \emph{ShEx} constraint definitions.

\missingfigure{add picture/code snippet of shapeMap}

The class \emph{ShexValidationRecord} stores the result of the validation for every single node of the graph. Not only is the individual result of every node checked against its relevant constraints, but we also calculate the percentage of nodes that conform to their constraints. 

\subsection{Front-end}
\label{frontend}
We implemented a front-end where the user can choose a \emph{knowledge graph} as well as a type of knowledge graph \Jamie{and its type. (because the type depends on the selected KG)}. \todo{check whether this is what we are doing in the finished version} In addition, the user can also set a limit. \todo{explain this in more detail, maybe also put the explanation in query} As output, \emph{ShEx} constraints as well as a validation of those constraints are given. The constraints can be edited by the user and those edited constraints can be revalidated.  
If a node is deemed invalid, a reason is given, e.g.\ ``Cardinality violation (min=1): 0''. The user can download the subset of the graph which was validated. The interaction between user, front-end and server can also be seen in Fig.~\ref{fig:sequence}.

\todo{explain how different limit influences data output}
\missingfigure{add screenshot of the front-end} 
\todo{update and scale sequence diagram and refer to it}


\begin{figure}[ht]
	\centering
	\includegraphics[scale=0.5]{kgshapes_sequence.pdf}
	\caption{Sequence diagram showing the interaction between web application, user and server}
	\label{fig:sequence}
\end{figure}

%\subsection{Citation}
%
%As a computer science student, you should read \cite{turing1950}. A useful \LaTeX\ companion is \cite{mittelbach2004}.
%
%\subsection{Table}
%\begin{table}[ht]
%	\centering
%	\begin{tabular}{lll}
%		\toprule
%		%Row1
%		X & Y1 & Y2\\
%		\midrule
%		%Row2
%		X1 & 1 &2\\
%		%Row3
%		X2 & 3 & 4\\
%		%Row4
%		X3 &5& 6\\
%		\bottomrule
%	\end{tabular}
%	\caption{This is a very simple table.} \label{tab:example1}
%\end{table}
%
%\begin{table}[ht]
%\centering
%	\begin{tabular}{lcc}
%		\toprule
%		%Row1
%		X & Y1 & Y2\\
%		\midrule
%		%Row2
%		X1 & \multicolumn{2}{c}{1}\\
%		%Row3
%		X2 & \multirow{2}{*}{2} & 3\\
%		%Row4
%		X3 && 4\\
%		\bottomrule
%	\end{tabular}
%\caption{This is another table.}
%\label{tab:example2}
%\end{table}
%
%You can refer to Table \ref{tab:example2} or Table \ref{tab:example1}.
%
%\subsection{Figure}
%
%\begin{figure}[ht]\usepackage{listings}
%	\centering
%	\includegraphics[width=100pt]{mannequin.jpg}
%	\caption{This is a figure.}
%	\label{fig:example}
%\end{figure}
%
%There is a beautiful figure (Fig.~\ref{fig:example}).

\section{Results}
Our framework automatically infers constraints and validates the given data based on those constraints. This can be done on two different \emph{CommonCrawl} datasets. The user can choose one of those datasets and a limit \todo{explain this limit in more depth, maybe in front-end?} using the front-end. The user can also edit constraints.

\missingfigure{Maybe add small figure that shows workflow of project here? Something similar like we did in presentation but more professional?}

\todo{describe results of benchmark tests here} 

\subsection{Future work}
\todo{Possible future work could be: more data sets, more possibilities for user inputs}
Our application currently only handles two different datasets. For future work, this could be expanded so that the framework could handle more and bigger datasets. Currently, the size of the datasets that can be handled is limited by the RAM on the virtual machine. One possible solution for this could be to only work on parts of the graph. 
One problem we encountered when handling datasets from \emph{CommonCrawl} was the quality of these datasets. Many datasets include \emph{non-unicode} characters, which are replaced by Jena with \emph{unicode} characters. This takes a lot of computing time. In addition, many files include invalid \emph{RDF} syntax or are otherwise damaged. This means that in order to handle additional datasets, some way of processing these datasets would have to be implemented. Processing could include filtering for broken files and invalid syntax and fixing this before handling the dataset in the framework. 
\todo{Should we add proper SPARQL endpoints here? Might not be possible?}
In addition, more possibilities for user interaction could be added. For instance, a feature could be added where a user can upload their own dataset and have it validated. 

\section{Evaluation}
\Jamie{Move the evaluation section before the results and future work.}
\todo{add benchmarks here}
\todo{check what Elwin said concerning Evaluation on meeting 20.01.2022}

\subsection{Methodology}
\Philipp{i labeled all the included graphics with h!, when we have finished the report we might want to make it so, that one image is on the top and one on the bottom, if 2 pages are on the same page, for example}
For taking the measurements, the application was started locally on our hardware, to minimize side-effects of other applications running on the virtual machine where the live-instance is deployed. The JVM was additionally set up to use up to 16 GB of main memory for its heap to allow parallel queries without compromising the runtime of the executions, arising from extensive swap usage.

\subsection{Runtime}

Figures~\ref{fig:exec_times_per_limit} and~\ref{fig:exec_times_per_triples} show the measurements we obtained by tuning the \emph{LIMIT} input parameter, thereby limiting the size of the start-node subset from which connected nodes are fetched. All the measurements are shown in Tables~\ref{table:runtimes_wo_limit} and~\ref{table:runtimes_w_limit}.
\begin{figure}[h!]
\begin{subfigure}{\textwidth}
\centering
\includegraphics[width=0.65\linewidth]{img/limit_legend.pdf}
\end{subfigure}
\begin{subfigure}{0.33\textwidth}
\includegraphics[width=0.9\linewidth]{img/Canal_limit.pdf} 
\caption{RDFType = Canal}
\label{fig:limit_canal}
\end{subfigure}
\begin{subfigure}{0.33\textwidth}
\includegraphics[width=0.9\linewidth]{img/RiverBodyOfWater_limit.pdf}
\caption{RDFType = RiverBodyOfWater}
\label{fig:limit_rbow}
\end{subfigure}
\begin{subfigure}{0.33\textwidth}
\includegraphics[width=0.9\linewidth]{img/Service_limit.pdf}
\caption{RDFType = Service}
\label{fig:limit_service}
\end{subfigure}
\caption{Execution times per RDFType, per size of start-node subset on RiverBodyOfWater dataset}
\label{fig:exec_times_per_limit}
\end{figure}

The results shown in Figure~\ref{fig:exec_times_per_limit} were to be expected. First of all, the runtime of constructing the desired subset of the graph is considerably larger than the time needed to create the \emph{ShEx} constraints, or to validate the constraints on the graph. This can also be seen in Figure~\ref{fig:exec_times_no_limit}, where we did not provide any limit.
Secondly, the smaller the \emph{LIMIT} the smaller the runtime. This becomes especially clear in Figures~\ref{fig:limit_canal} and~\ref{fig:limit_service}.

To understand the behaviour shown in Figure~\ref{fig:limit_rbow}, we want to look at Figure~\ref{fig:exec_times_per_triples}, which shows the same runtimes, but grouped by the number of triples in the subgraph on which the constraints are created. Unlike in Figures~\ref{fig:triple_canal} and~\ref{fig:triple_service}, the maximum number of triples (shown on the x-axis in Figure~\ref{fig:triple_rbow}) is 1769.
This is also the amount of triples contained in the subgraph that we get without providing any limit. Therefore, providing a limit larger than 200 won't enrich the constructed graph, keeping the time almost constant in regards to the \emph{LIMIT} parameter. 

\begin{figure}[h!]
\begin{subfigure}{\textwidth}
\centering
\includegraphics[width=0.65\linewidth]{img/triple_legend.pdf}
\end{subfigure}
\begin{subfigure}{0.33\textwidth}
\includegraphics[width=0.9\linewidth]{img/Canal_triple.pdf} 
\caption{RDFType = Canal}
\label{fig:triple_canal}
\end{subfigure}
\begin{subfigure}{0.33\textwidth}
\includegraphics[width=0.9\linewidth]{img/RiverBodyOfWater_triple.pdf}
\caption{RDFType = RiverBodyOfWater}
\label{fig:triple_rbow}
\end{subfigure}
\begin{subfigure}{0.33\textwidth}
\includegraphics[width=0.9\linewidth]{img/Service_triple.pdf}
\caption{RDFType = Service}
\label{fig:triple_service}
\end{subfigure}

\caption{Execution times per RDFType, per number of triples on RiverBodyOfWater dataset}
\label{fig:exec_times_per_triples}
\end{figure}

Figure~\ref{fig:exec_times_no_limit} shows the runtime without providing a limit for constructing the subgraph. Note the much larger runtime needed for querying the graph, despite resulting in the same amount of triples when providing a large enough \emph{LIMIT}.

\begin{figure}[h!]
\begin{subfigure}{\textwidth}
\centering
\includegraphics[width=0.65\linewidth]{img/no_limit_legend.pdf}
\end{subfigure}
\begin{subfigure}{\textwidth}
\centering
\includegraphics[width=0.6\linewidth]{img/no_limit.pdf}
\end{subfigure}
\caption{Execution times per RDFType of the RiverBodyOfWater dataset (containing 49915 triples)}
\label{fig:exec_times_no_limit}
\end{figure}

% The listing needs its own float environment; the opening \begin{figure} was missing.
\begin{figure}[h!]
\lstinputlisting{code_snippets/blank_nodes.ttl}
\caption{Blank Nodes in Turtle File}
\label{code:anon_nodes}
\end{figure}
\subsection{ShEx Validation}

\section{Conclusion}
\todo{Which challenges did we face during the implementation? (Maybe depth of SPARQL query, outdated RDF2Graph?)}
\todo{Did we achieve what we wanted to do? How well and reliably does the framework work?}
% ------------------------------------------------------------------------
% Bibliography
% ------------------------------------------------------------------------
\bibliography{bibliography}
\bibliographystyle{abbrv}

\appendix
\section{Contribution Statements}
Please write down a short contribution statement for each member of your group. You may evaluate the contribution along the three common categories:
i) conception (i.\,e., problem framing, ideation, validation, and method selection), ii) operational work (e.\,g., setting up your tech stack, algorithm implementation, data analysis, and interpretation), and iii) writing \& reporting (i.\,e., report drafting, literature review, revision of comments, presentation preparations, etc.).
\section{Appendix}
You may use appendices to include any auxiliary results you would like to share, however cannot insert in the main text due to the page limit. 


\begin{table}[!ht]
    \centering
    \begin{tabular}{|l|r|r|r|r|}
    \hline
        Rdftype & Triples & [$t_{graph}$] = ms & [$t_{shex}$] = ms & [$t_{validation}$] = ms \\ \hline
        Canal & 16961 & 360000 & 737 & 45 \\ \hline
        GeoCoordinates & 204 & 468000 & 585 & 4 \\ \hline
        RiverBodyOfWater & 1769 & 468000 & 613 & 15 \\ \hline
        Service & 7334 & 462000 & 618 & 19 \\ \hline
    \end{tabular}
    \caption{Execution times per RDF-Type, queried on full graph of the RiverBodyOfWater dataset (containing 49915 triples)}
    \label{table:runtimes_wo_limit}
\end{table}

\begin{table}
    \centering
    \begin{tabular}{|l|r|r|r|r|r|}
    \hline
        Rdftype & Limit & Triples & [$t_{graph}$] = ms & [$t_{shex}$] = ms & [$t_{validation}$] = ms \\ \hline
        Canal & 50 & 226 & 2420 & 923 & 44 \\ \hline
        Canal & 100 & 328 & 2260 & 709 & 13 \\ \hline
        Canal & 200 & 765 & 1740 & 637 & 8 \\ \hline
        Canal & 400 & 1588 & 2020 & 559 & 9 \\ \hline
        Canal & 800 & 3176 & 3270 & 661 & 8 \\ \hline
        Canal & 1600 & 6817 & 5030 & 654 & 17 \\ \hline
        Canal & 3200 & 13504 & 9100 & 736 & 33 \\ \hline
        Canal & 6400 & 16961 & 10790 & 665 & 25 \\ \hline
        RiverBodyOfWater & 50 & 192 & 2680 & 615 & 5 \\ \hline
        RiverBodyOfWater & 75 & 291 & 2490 & 586 & 4 \\ \hline
        RiverBodyOfWater & 100 & 1187 & 2700 & 624 & 7 \\ \hline
        RiverBodyOfWater & 200 & 1769 & 4890 & 643 & 17 \\ \hline
        RiverBodyOfWater & 400 & 1769 & 4840 & 641 & 8 \\ \hline
        RiverBodyOfWater & 800 & 1769 & 4980 & 619 & 18 \\ \hline
        Service & 50 & 615 & 1640 & 602 & 7 \\ \hline
        Service & 100 & 1022 & 1790 & 562 & 6 \\ \hline
        Service & 200 & 1852 & 2300 & 577 & 9 \\ \hline
        Service & 400 & 3041 & 2880 & 601 & 6 \\ \hline
        Service & 850 & 5050 & 5856 & 606 & 12 \\ \hline
    \end{tabular}
    \caption{Execution times per RDF-Type, limited size of start-node subset (using the RiverBodyOfWater dataset)}
    \label{table:runtimes_w_limit}
\end{table}
\end{document}